Add the GPU rigid body pipeline from https://github.com/erwincoumans/experiments as a Bullet 3.x preview for Bullet 2.80

This commit is contained in:
erwin.coumans
2012-03-05 00:54:32 +00:00
parent 73c4646b40
commit 571af41cf6
257 changed files with 55106 additions and 0 deletions

View File

@@ -0,0 +1,73 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma once
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
namespace adl
{
// Shared, non-templated pieces of the Copy primitive: the work-item
// granularity options accepted by the float4 copy path.
class CopyBase
{
	public:
		// How many float4 elements each work item copies per invocation.
		enum Option
		{
			PER_WI_1 = 0,	// one element per work item
			PER_WI_2 = 1,	// two elements per work item
			PER_WI_4 = 2,	// four elements per work item
		};
};
// Device-side buffer copy primitive for the backend selected by TYPE.
// Usage: Data* d = Copy<T>::allocate(device); Copy<T>::execute(d, ...);
// Copy<T>::deallocate(d). A TYPE_HOST specialization lives in CopyHost.inl.
template<DeviceType TYPE>
class Copy : public CopyBase
{
	public:
		typedef Launcher::BufferInfo BufferInfo;

		// Per-device state: the compiled kernels plus a small constant buffer.
		struct Data
		{
			const Device* m_device;			// device the kernels were built for
			Kernel* m_copy1F4Kernel;		// float4 copy, 1 element per work item
			Kernel* m_copy2F4Kernel;		// float4 copy, 2 elements per work item
			Kernel* m_copy4F4Kernel;		// float4 copy, 4 elements per work item
			Kernel* m_copyF1Kernel;			// float copy
			Kernel* m_copyF2Kernel;			// float2 copy
			Buffer<int4>* m_constBuffer;	// carries n to the kernels (x lane)
		};

		// Builds the kernels for 'deviceData'; caller owns the returned Data
		// and must release it with deallocate().
		static
		Data* allocate(const Device* deviceData);

		static
		void deallocate(Data* data);

		// Copies n elements from src to dst. 'option' selects how many float4
		// elements each work item handles; n must be divisible by that width
		// (asserted in the .inl).
		static
		void execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option = PER_WI_1);
		static
		void execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n);
		static
		void execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n);
};
#include <AdlPrimitives/Copy/CopyHost.inl>
#include <AdlPrimitives/Copy/Copy.inl>
};

View File

@@ -0,0 +1,151 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Copy\\CopyKernels"
#define KERNEL0 "Copy1F4Kernel"
#define KERNEL1 "Copy2F4Kernel"
#define KERNEL2 "Copy4F4Kernel"
#define KERNEL3 "CopyF1Kernel"
#define KERNEL4 "CopyF2Kernel"
#include <AdlPrimitives/Copy/CopyKernelsCL.h>
#include <AdlPrimitives/Copy/CopyKernelsDX11.h>
// Builds the five copy kernels for 'device' and allocates the shared
// constant buffer. Caller releases the result with deallocate().
template<DeviceType TYPE>
typename Copy<TYPE>::Data* Copy<TYPE>::allocate( const Device* device )
{
	ADLASSERT( TYPE == device->m_type );
	// Kernel-source table indexed by device type: embedded strings when
	// ADL_LOAD_KERNEL_FROM_STRING is defined, otherwise nulls so that
	// getKernel() loads the source from PATH on disk.
	// NOTE(review): assumes the DeviceType enum values index {CL, DX11}
	// in this order -- confirm against the Adl DeviceType declaration.
	const char* src[] =
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
		{copyKernelsCL, copyKernelsDX11};
//		ADLASSERT(0);
#else
		{0,0};
#endif

	Data* data = new Data;
	data->m_device = device;
	// All five entry points come from the same source file/string.
	data->m_copy1F4Kernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
	data->m_copy2F4Kernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
	data->m_copy4F4Kernel = device->getKernel( PATH, KERNEL2, 0, src[TYPE] );
	data->m_copyF1Kernel = device->getKernel( PATH, KERNEL3, 0, src[TYPE] );
	data->m_copyF2Kernel = device->getKernel( PATH, KERNEL4, 0, src[TYPE] );
	// One int4 is enough: only the x lane (n) is consumed by the kernels.
	data->m_constBuffer = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
	return data;
}
// Releases a Data created by allocate(). Null-safe: deallocate(0) is a
// no-op, mirroring delete semantics.
template<DeviceType TYPE>
void Copy<TYPE>::deallocate( Data* data )
{
	if( !data )
		return;
	delete data->m_constBuffer;
	delete data;
}
// Copies n float4 elements from src to dst on the device. 'option'
// selects how many elements each work item copies; n must be divisible
// by that width. The three cases only differed in kernel and divisor,
// so the selection is factored out and a single launch is performed.
template<DeviceType TYPE>
void Copy<TYPE>::execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option )
{
	ADLASSERT( TYPE == dst.getType() );
	ADLASSERT( TYPE == src.getType() );

	// Map the requested granularity to a kernel and elements-per-work-item.
	Kernel* kernel = 0;
	int elemsPerWI = 1;
	switch( option )
	{
	case PER_WI_1: kernel = data->m_copy1F4Kernel; elemsPerWI = 1; break;
	case PER_WI_2: kernel = data->m_copy2F4Kernel; elemsPerWI = 2; break;
	case PER_WI_4: kernel = data->m_copy4F4Kernel; elemsPerWI = 4; break;
	default:
		ADLASSERT(0);	// unknown option: nothing is launched
		return;
	}
	ADLASSERT( n%elemsPerWI == 0 );

	int4 constBuffer;
	constBuffer.x = n;

	BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
	Launcher launcher( data->m_device, kernel );
	launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
	launcher.setConst( *data->m_constBuffer, constBuffer );
	launcher.launch1D( n/elemsPerWI );
}
// Copies n float2 elements from src to dst using the F2 kernel,
// one element per work item.
template<DeviceType TYPE>
void Copy<TYPE>::execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n )
{
	// Both buffers must live on the device this Copy instance targets.
	ADLASSERT( TYPE == dst.getType() );
	ADLASSERT( TYPE == src.getType() );

	int4 cb;
	cb.x = n;

	Launcher launcher( data->m_device, data->m_copyF2Kernel );
	BufferInfo bufInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
	launcher.setBuffers( bufInfo, sizeof(bufInfo)/sizeof(Launcher::BufferInfo) );
	launcher.setConst( *data->m_constBuffer, cb );
	launcher.launch1D( n );
}
// Copies n float elements from src to dst using the F1 kernel,
// one element per work item.
template<DeviceType TYPE>
void Copy<TYPE>::execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n )
{
	// Both buffers must live on the device this Copy instance targets.
	ADLASSERT( TYPE == dst.getType() );
	ADLASSERT( TYPE == src.getType() );

	int4 cb;
	cb.x = n;

	Launcher launcher( data->m_device, data->m_copyF1Kernel );
	BufferInfo bufInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
	launcher.setBuffers( bufInfo, sizeof(bufInfo)/sizeof(Launcher::BufferInfo) );
	launcher.setConst( *data->m_constBuffer, cb );
	launcher.launch1D( n );
}
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2
#undef KERNEL3
#undef KERNEL4

View File

@@ -0,0 +1,85 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
template<>
class Copy<TYPE_HOST> : public CopyBase
{
public:
typedef Launcher::BufferInfo BufferInfo;
struct Data
{
};
static
Data* allocate(const Device* deviceData)
{
ADLASSERT( TYPE_HOST == deviceData->m_type );
return 0;
}
static
void deallocate(Data* data)
{
return;
}
static
void execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option = PER_WI_1)
{
ADLASSERT( TYPE_HOST == dst.getType() );
ADLASSERT( TYPE_HOST == src.getType() );
HostBuffer<float4>& dstH = (HostBuffer<float4>&)dst;
HostBuffer<float4>& srcH = (HostBuffer<float4>&)src;
for(int i=0; i<n; i++)
{
dstH[i] = srcH[i];
}
}
static
void execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n)
{
ADLASSERT( TYPE_HOST == dst.getType() );
ADLASSERT( TYPE_HOST == src.getType() );
HostBuffer<float2>& dstH = (HostBuffer<float2>&)dst;
HostBuffer<float2>& srcH = (HostBuffer<float2>&)src;
for(int i=0; i<n; i++)
{
dstH[i] = srcH[i];
}
}
static
void execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n)
{
ADLASSERT( TYPE_HOST == dst.getType() );
ADLASSERT( TYPE_HOST == src.getType() );
HostBuffer<float>& dstH = (HostBuffer<float>&)dst;
HostBuffer<float>& srcH = (HostBuffer<float>&)src;
for(int i=0; i<n; i++)
{
dstH[i] = srcH[i];
}
}
};

View File

@@ -0,0 +1,128 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)
// Parameters passed by value to every copy kernel; padded so the layout
// matches the host-side Buffer<int4> constant buffer (x lane = n).
typedef struct
{
	int m_n;			// number of elements to copy
	int m_padding[3];	// pad to 16 bytes (int4)
} ConstBuffer;
// One float4 per work item: dst[i] = src[i] for i in [0, n).
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy1F4Kernel(__global float4* dst, __global float4* src,
		ConstBuffer cb)
{
	int idx = GET_GLOBAL_IDX;

	// Work items past the element count do nothing.
	if( idx < cb.m_n )
	{
		dst[ idx ] = src[ idx ];
	}
}
// Two consecutive float4 elements per work item.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy2F4Kernel(__global float4* dst, __global float4* src,
		ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;

	// Guard must be strict '<': with 2*gIdx == m_n this work item would
	// read and write elements [m_n, m_n+1], past the end of the buffers
	// (the host launches n/2 work items and asserts n%2 == 0, but a tail
	// work item can still land exactly on m_n).
	if( 2*gIdx < cb.m_n )
	{
		float4 a0 = src[gIdx*2+0];
		float4 a1 = src[gIdx*2+1];

		dst[ gIdx*2+0 ] = a0;
		dst[ gIdx*2+1 ] = a1;
	}
}
// Four consecutive float4 elements per work item.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy4F4Kernel(__global float4* dst, __global float4* src,
		ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;

	// Guard must be strict '<': with 4*gIdx == m_n this work item would
	// access elements [m_n, m_n+3], past the end of the buffers.
	if( 4*gIdx < cb.m_n )
	{
		int idx0 = gIdx*4+0;
		int idx1 = gIdx*4+1;
		int idx2 = gIdx*4+2;
		int idx3 = gIdx*4+3;

		float4 a0 = src[idx0];
		float4 a1 = src[idx1];
		float4 a2 = src[idx2];
		float4 a3 = src[idx3];

		dst[ idx0 ] = a0;
		dst[ idx1 ] = a1;
		dst[ idx2 ] = a2;
		dst[ idx3 ] = a3;
	}
}
// One float per work item: dstF1[i] = srcF1[i] for i in [0, n).
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void CopyF1Kernel(__global float* dstF1, __global float* srcF1,
		ConstBuffer cb)
{
	int idx = GET_GLOBAL_IDX;

	// Work items past the element count do nothing.
	if( idx < cb.m_n )
	{
		dstF1[ idx ] = srcF1[ idx ];
	}
}
// One float2 per work item: dstF2[i] = srcF2[i] for i in [0, n).
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2,
		ConstBuffer cb)
{
	int idx = GET_GLOBAL_IDX;

	// Work items past the element count do nothing.
	if( idx < cb.m_n )
	{
		dstF2[ idx ] = srcF2[ idx ];
	}
}

View File

@@ -0,0 +1,130 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
typedef uint u32;
#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
#define GROUP_MEM_FENCE
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
#define AtomInc(x) InterlockedAdd(x, 1)
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
#define make_uint4 uint4
#define make_uint2 uint2
#define make_int2 int2
#define WG_SIZE 64
#define GET_GROUP_SIZE WG_SIZE
cbuffer CB : register( b0 )
{
int m_n;
int m_padding[3];
};
RWStructuredBuffer<float4> dst : register( u0 );
StructuredBuffer<float4> src : register( t0 );
// One float4 per thread: dst[i] = src[i] for i in [0, m_n).
[numthreads(WG_SIZE, 1, 1)]
void Copy1F4Kernel( DEFAULT_ARGS )
{
	int idx = GET_GLOBAL_IDX;

	// Threads past the element count do nothing.
	if( idx < m_n )
	{
		dst[ idx ] = src[ idx ];
	}
}
// Two consecutive float4 elements per thread.
[numthreads(WG_SIZE, 1, 1)]
void Copy2F4Kernel( DEFAULT_ARGS )
{
	int gIdx = GET_GLOBAL_IDX;

	// Guard must be strict '<': with 2*gIdx == m_n this thread would
	// access elements [m_n, m_n+1], past the end of the buffers.
	if( 2*gIdx < m_n )
	{
		float4 a0 = src[gIdx*2+0];
		float4 a1 = src[gIdx*2+1];

		dst[ gIdx*2+0 ] = a0;
		dst[ gIdx*2+1 ] = a1;
	}
}
// Four consecutive float4 elements per thread.
[numthreads(WG_SIZE, 1, 1)]
void Copy4F4Kernel( DEFAULT_ARGS )
{
	int gIdx = GET_GLOBAL_IDX;

	// Guard must be strict '<': with 4*gIdx == m_n this thread would
	// access elements [m_n, m_n+3], past the end of the buffers.
	if( 4*gIdx < m_n )
	{
		int idx0 = gIdx*4+0;
		int idx1 = gIdx*4+1;
		int idx2 = gIdx*4+2;
		int idx3 = gIdx*4+3;

		float4 a0 = src[idx0];
		float4 a1 = src[idx1];
		float4 a2 = src[idx2];
		float4 a3 = src[idx3];

		dst[ idx0 ] = a0;
		dst[ idx1 ] = a1;
		dst[ idx2 ] = a2;
		dst[ idx3 ] = a3;
	}
}
RWStructuredBuffer<float> dstF1 : register( u0 );
StructuredBuffer<float> srcF1 : register( t0 );
// One float per thread: dstF1[i] = srcF1[i] for i in [0, m_n).
[numthreads(WG_SIZE, 1, 1)]
void CopyF1Kernel( DEFAULT_ARGS )
{
	int idx = GET_GLOBAL_IDX;

	// Threads past the element count do nothing.
	if( idx < m_n )
	{
		dstF1[ idx ] = srcF1[ idx ];
	}
}
RWStructuredBuffer<float2> dstF2 : register( u0 );
StructuredBuffer<float2> srcF2 : register( t0 );
// One float2 per thread: dstF2[i] = srcF2[i] for i in [0, m_n).
[numthreads(WG_SIZE, 1, 1)]
void CopyF2Kernel( DEFAULT_ARGS )
{
	int idx = GET_GLOBAL_IDX;

	// Threads past the element count do nothing.
	if( idx < m_n )
	{
		dstF2[ idx ] = srcF2[ idx ];
	}
}

View File

@@ -0,0 +1,119 @@
// Embedded OpenCL source for the Copy kernels, used when
// ADL_LOAD_KERNEL_FROM_STRING is defined instead of loading from disk.
// Fix: the 2-wide and 4-wide guards use a strict '<' so a tail work item
// whose first element index equals m_n cannot access past the buffer end.
static const char* copyKernelsCL= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
" int m_n;\n"
" int m_padding[3];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx];\n"
"\n"
" dst[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 2*gIdx < cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx*2+0];\n"
" float4 a1 = src[gIdx*2+1];\n"
"\n"
" dst[ gIdx*2+0 ] = a0;\n"
" dst[ gIdx*2+1 ] = a1;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 4*gIdx < cb.m_n )\n"
" {\n"
" int idx0 = gIdx*4+0;\n"
" int idx1 = gIdx*4+1;\n"
" int idx2 = gIdx*4+2;\n"
" int idx3 = gIdx*4+3;\n"
"\n"
" float4 a0 = src[idx0];\n"
" float4 a1 = src[idx1];\n"
" float4 a2 = src[idx2];\n"
" float4 a3 = src[idx3];\n"
"\n"
" dst[ idx0 ] = a0;\n"
" dst[ idx1 ] = a1;\n"
" dst[ idx2 ] = a2;\n"
" dst[ idx3 ] = a3;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float a0 = srcF1[gIdx];\n"
"\n"
" dstF1[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float2 a0 = srcF2[gIdx];\n"
"\n"
" dstF2[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,120 @@
// Embedded HLSL source for the Copy kernels, used when
// ADL_LOAD_KERNEL_FROM_STRING is defined instead of loading from disk.
// Fix: the 2-wide and 4-wide guards use a strict '<' so a tail thread
// whose first element index equals m_n cannot access past the buffer end.
static const char* copyKernelsDX11= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define GROUP_MEM_FENCE\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"#define make_uint4 uint4\n"
"#define make_uint2 uint2\n"
"#define make_int2 int2\n"
"\n"
"#define WG_SIZE 64\n"
"\n"
"#define GET_GROUP_SIZE WG_SIZE\n"
"\n"
"\n"
"\n"
"cbuffer CB : register( b0 )\n"
"{\n"
" int m_n;\n"
" int m_padding[3];\n"
"};\n"
"\n"
"RWStructuredBuffer<float4> dst : register( u0 );\n"
"StructuredBuffer<float4> src : register( t0 );\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void Copy1F4Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < m_n )\n"
" {\n"
" float4 a0 = src[gIdx];\n"
"\n"
" dst[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void Copy2F4Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 2*gIdx < m_n )\n"
" {\n"
" float4 a0 = src[gIdx*2+0];\n"
" float4 a1 = src[gIdx*2+1];\n"
"\n"
" dst[ gIdx*2+0 ] = a0;\n"
" dst[ gIdx*2+1 ] = a1;\n"
" }\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void Copy4F4Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 4*gIdx < m_n )\n"
" {\n"
" int idx0 = gIdx*4+0;\n"
" int idx1 = gIdx*4+1;\n"
" int idx2 = gIdx*4+2;\n"
" int idx3 = gIdx*4+3;\n"
"\n"
" float4 a0 = src[idx0];\n"
" float4 a1 = src[idx1];\n"
" float4 a2 = src[idx2];\n"
" float4 a3 = src[idx3];\n"
"\n"
" dst[ idx0 ] = a0;\n"
" dst[ idx1 ] = a1;\n"
" dst[ idx2 ] = a2;\n"
" dst[ idx3 ] = a3;\n"
" }\n"
"}\n"
"\n"
"RWStructuredBuffer<float> dstF1 : register( u0 );\n"
"StructuredBuffer<float> srcF1 : register( t0 );\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void CopyF1Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < m_n )\n"
" {\n"
" float a0 = srcF1[gIdx];\n"
"\n"
" dstF1[ gIdx ] = a0;\n"
" }\n"
"\n"
"}\n"
"\n"
"RWStructuredBuffer<float2> dstF2 : register( u0 );\n"
"StructuredBuffer<float2> srcF2 : register( t0 );\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void CopyF2Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < m_n )\n"
" {\n"
" float2 a0 = srcF2[gIdx];\n"
"\n"
" dstF2[ gIdx ] = a0;\n"
" }\n"
"}\n"
;

View File

@@ -0,0 +1,77 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma once
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
namespace adl
{
// Non-templated base for the Fill primitive. The Option enum is
// currently empty: Fill exposes no launch-time options yet.
class FillBase
{
	public:
		enum Option
		{
		};
};
// Device-side fill primitive: writes a constant int/int2/int4 value into
// the range [offset, offset+n) of a buffer on the backend selected by
// TYPE. A TYPE_HOST specialization lives in FillHost.inl.
template<DeviceType TYPE>
class Fill
{
	public:
		typedef Launcher::BufferInfo BufferInfo;

		// Constant-buffer layout shared with the fill kernels.
		struct ConstData
		{
			int4 m_data;		// fill value (narrower types widened to int4)
			int m_offset;		// first element written
			int m_n;			// number of elements written
			int m_padding[2];	// pad struct to a 16-byte multiple
		};

		// Per-device state: the compiled kernels plus the constant buffer.
		struct Data
		{
			const Device* m_device;
			Kernel* m_fillIntKernel;
			Kernel* m_fillInt2Kernel;
			Kernel* m_fillInt4Kernel;
			Buffer<ConstData>* m_constBuffer;
		};

		// Builds the kernels for 'deviceData'; caller owns the returned Data
		// and must release it with deallocate().
		static
		Data* allocate(const Device* deviceData);
		static
		void deallocate(Data* data);
		// Fills src[offset, offset+n) with 'value'; n must be > 0.
		static
		void execute(Data* data, Buffer<int>& src, const int& value, int n, int offset = 0);
		static
		void execute(Data* data, Buffer<int2>& src, const int2& value, int n, int offset = 0);
		static
		void execute(Data* data, Buffer<int4>& src, const int4& value, int n, int offset = 0);
};
#include <AdlPrimitives/Fill/FillHost.inl>
#include <AdlPrimitives/Fill/Fill.inl>
};

View File

@@ -0,0 +1,123 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//#define PATH "..\\..\\AdlPrimitives\\Fill\\FillKernels"
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Fill\\FillKernels"
#define KERNEL0 "FillIntKernel"
#define KERNEL1 "FillInt2Kernel"
#define KERNEL2 "FillInt4Kernel"
#include <AdlPrimitives/Fill/FillKernelsCL.h>
#include <AdlPrimitives/Fill/FillKernelsDX11.h>
// Builds the three fill kernels for 'device' and allocates the shared
// constant buffer. Caller releases the result with deallocate().
template<DeviceType TYPE>
typename Fill<TYPE>::Data* Fill<TYPE>::allocate( const Device* device )
{
	ADLASSERT( TYPE == device->m_type );
	// Kernel-source table indexed by device type: embedded strings when
	// ADL_LOAD_KERNEL_FROM_STRING is defined, otherwise nulls so that
	// getKernel() loads the source from PATH on disk.
	// NOTE(review): assumes the DeviceType enum values index {CL, DX11}
	// in this order -- confirm against the Adl DeviceType declaration.
	const char* src[] =
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
		{fillKernelsCL, fillKernelsDX11};
#else
		{0,0};
#endif

	Data* data = new Data;
	data->m_device = device;
	// All three entry points come from the same source file/string.
	data->m_fillIntKernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
	data->m_fillInt2Kernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
	data->m_fillInt4Kernel = device->getKernel( PATH, KERNEL2, 0, src[TYPE] );
	data->m_constBuffer = new Buffer<ConstData>( device, 1, BufferBase::BUFFER_CONST );
	return data;
}
// Releases a Data created by allocate(). Null-safe: deallocate(0) is a
// no-op, mirroring delete semantics.
template<DeviceType TYPE>
void Fill<TYPE>::deallocate( Data* data )
{
	if( !data )
		return;
	delete data->m_constBuffer;
	delete data;
}
// Fills src[offset, offset+n) with 'value' using the scalar int kernel.
template<DeviceType TYPE>
void Fill<TYPE>::execute(Data* data, Buffer<int>& src, const int& value, int n, int offset)
{
	ADLASSERT( n>0 );

	// Pack the fill value and the target range into the constant buffer.
	ConstData cb;
	cb.m_data = make_int4( value );
	cb.m_offset = offset;
	cb.m_n = n;

	BufferInfo info[] = { BufferInfo( &src ) };
	Launcher launcher( data->m_device, data->m_fillIntKernel );
	launcher.setBuffers( info, sizeof(info)/sizeof(Launcher::BufferInfo) );
	launcher.setConst( *data->m_constBuffer, cb );
	launcher.launch1D( n );
}
// Fills src[offset, offset+n) with 'value' using the int2 kernel.
template<DeviceType TYPE>
void Fill<TYPE>::execute(Data* data, Buffer<int2>& src, const int2& value, int n, int offset)
{
	ADLASSERT( n>0 );

	// Widen the int2 fill value to int4 for the shared constant layout.
	ConstData cb;
	cb.m_data = make_int4( value.x, value.y, 0, 0 );
	cb.m_offset = offset;
	cb.m_n = n;

	BufferInfo info[] = { BufferInfo( &src ) };
	Launcher launcher( data->m_device, data->m_fillInt2Kernel );
	launcher.setBuffers( info, sizeof(info)/sizeof(Launcher::BufferInfo) );
	launcher.setConst( *data->m_constBuffer, cb );
	launcher.launch1D( n );
}
// Fills src[offset, offset+n) with 'value' using the int4 kernel.
template<DeviceType TYPE>
void Fill<TYPE>::execute(Data* data, Buffer<int4>& src, const int4& value, int n, int offset)
{
	ADLASSERT( n>0 );

	// The int4 value maps directly onto the constant layout.
	ConstData cb;
	cb.m_data = value;
	cb.m_offset = offset;
	cb.m_n = n;

	BufferInfo info[] = { BufferInfo( &src ) };
	Launcher launcher( data->m_device, data->m_fillInt4Kernel );
	launcher.setBuffers( info, sizeof(info)/sizeof(Launcher::BufferInfo) );
	launcher.setConst( *data->m_constBuffer, cb );
	launcher.launch1D( n );
}
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2

View File

@@ -0,0 +1,99 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
template<>
class Fill<TYPE_HOST>
{
public:
struct Data
{
};
static
Data* allocate(const Device* deviceData)
{
return 0;
}
static
void deallocate(Data* data)
{
}
template<typename T>
static
void executeImpl(Data* data, Buffer<T>& src, const T& value, int n, int offset = 0)
{
ADLASSERT( src.getType() == TYPE_HOST );
ADLASSERT( src.m_size >= offset+n );
HostBuffer<T>& hSrc = (HostBuffer<T>&)src;
for(int idx=offset; idx<offset+n; idx++)
{
hSrc[idx] = value;
}
}
static
void execute(Data* data, Buffer<int>& src, const int& value, int n, int offset = 0)
{
executeImpl( data, src, value, n, offset );
}
static
void execute(Data* data, Buffer<int2>& src, const int2& value, int n, int offset = 0)
{
executeImpl( data, src, value, n, offset );
}
static
void execute(Data* data, Buffer<int4>& src, const int4& value, int n, int offset = 0)
{
executeImpl( data, src, value, n, offset );
}
/*
static
void execute(Data* data, Buffer<int>& src, int value, int n, int offset = 0)
{
ADLASSERT( src.getType() == TYPE_HOST );
ADLASSERT( src.m_size <= offset+n );
HostBuffer<u32>& hSrc = (HostBuffer<u32>&)src;
for(int idx=offset; idx<offset+n; idx++)
{
src[i] = value;
}
}
static
void execute(Data* data, Buffer<int2>& src, const int2& value, int n, int offset = 0)
{
ADLASSERT( src.getType() == TYPE_HOST );
ADLASSERT( src.m_size <= offset+n );
}
static
void execute(Data* data, Buffer<int4>& src, const int4& value, int n, int offset = 0)
{
ADLASSERT( src.getType() == TYPE_HOST );
ADLASSERT( src.m_size <= offset+n );
}
*/
};

View File

@@ -0,0 +1,81 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)
// Parameters passed by value to every fill kernel; mirrors the host-side
// Fill ConstData layout (int4 value, then range, padded to 16-byte multiple).
typedef struct
{
	int4 m_data;		// fill value
	int m_offset;		// first element written
	int m_n;			// number of elements written
	int m_padding[2];	// pad to a 16-byte multiple
} ConstBuffer;
// Writes cb.m_data.x into dstInt[m_offset + i] for i in [0, m_n).
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillIntKernel(__global int* dstInt,
		ConstBuffer cb)
{
	int idx = GET_GLOBAL_IDX;

	// Work items past the fill count do nothing.
	if( idx < cb.m_n )
	{
		dstInt[ cb.m_offset + idx ] = cb.m_data.x;
	}
}
// Writes (m_data.x, m_data.y) into dstInt2[m_offset + i] for i in [0, m_n).
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillInt2Kernel(__global int2* dstInt2,
		ConstBuffer cb)
{
	int idx = GET_GLOBAL_IDX;

	// Work items past the fill count do nothing.
	if( idx < cb.m_n )
	{
		dstInt2[ cb.m_offset + idx ] = make_int2( cb.m_data.x, cb.m_data.y );
	}
}
// Writes m_data into dstInt4[m_offset + i] for i in [0, m_n).
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillInt4Kernel(__global int4* dstInt4,
		ConstBuffer cb)
{
	int idx = GET_GLOBAL_IDX;

	// Work items past the fill count do nothing.
	if( idx < cb.m_n )
	{
		dstInt4[ cb.m_offset + idx ] = cb.m_data;
	}
}

View File

@@ -0,0 +1,79 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
typedef uint u32;
#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
#define GROUP_MEM_FENCE
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
#define AtomInc(x) InterlockedAdd(x, 1)
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
#define make_uint4 uint4
#define make_uint2 uint2
#define make_int2 int2
cbuffer CB : register( b0 )
{
int4 m_data;
int m_offset;
int m_n;
int m_padding[2];
};
RWStructuredBuffer<int> dstInt : register( u0 );
// Writes m_data.x into dstInt[m_offset + i] for i in [0, m_n).
[numthreads(64, 1, 1)]
void FillIntKernel( DEFAULT_ARGS )
{
	int idx = GET_GLOBAL_IDX;

	// Threads past the fill count do nothing.
	if( idx < m_n )
	{
		dstInt[ m_offset + idx ] = m_data.x;
	}
}
RWStructuredBuffer<int2> dstInt2 : register( u0 );
// Writes (m_data.x, m_data.y) into dstInt2[m_offset + i] for i in [0, m_n).
[numthreads(64, 1, 1)]
void FillInt2Kernel( DEFAULT_ARGS )
{
	int idx = GET_GLOBAL_IDX;

	// Threads past the fill count do nothing.
	if( idx < m_n )
	{
		dstInt2[ m_offset + idx ] = make_int2( m_data.x, m_data.y );
	}
}
RWStructuredBuffer<int4> dstInt4 : register( u0 );
[numthreads(64, 1, 1)]
void FillInt4Kernel( DEFAULT_ARGS )
{
int gIdx = GET_GLOBAL_IDX;
if( gIdx < m_n )
{
dstInt4[ m_offset+gIdx ] = m_data;
}
}

View File

@@ -0,0 +1,71 @@
// Embedded OpenCL source for the buffer-fill kernels (int/int2/int4).
// Keeping the source as a string lets the runtime compile it without
// shipping a separate .cl file. NOTE(review): presumably generated from
// the stand-alone .cl source -- keep the two in sync when editing.
static const char* fillKernelsCL= \
"/*\n"
"		2011 Takahiro Harada\n"
"*/\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
"	int4 m_data;\n"
"	int m_offset;\n"
"	int m_n;\n"
"	int m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillIntKernel(__global int* dstInt, \n"
"		ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < cb.m_n )\n"
"	{\n"
"		dstInt[ cb.m_offset+gIdx ] = cb.m_data.x;\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt2Kernel(__global int2* dstInt2, \n"
"		ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < cb.m_n )\n"
"	{\n"
"		dstInt2[ cb.m_offset+gIdx ] = make_int2( cb.m_data.x, cb.m_data.y );\n"
"	}\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt4Kernel(__global int4* dstInt4, \n"
"		ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < cb.m_n )\n"
"	{\n"
"		dstInt4[ cb.m_offset+gIdx ] = cb.m_data;\n"
"	}\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,69 @@
// Embedded HLSL source for the DX11 buffer-fill kernels, mirroring
// fillKernelsCL. NOTE(review): presumably generated from the stand-alone
// .hlsl source -- keep the two in sync when editing.
static const char* fillKernelsDX11= \
"/*\n"
"		2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define GROUP_MEM_FENCE\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"#define make_uint4 uint4\n"
"#define make_uint2 uint2\n"
"#define make_int2 int2\n"
"\n"
"\n"
"cbuffer CB : register( b0 )\n"
"{\n"
"	int4 m_data;\n"
"	int m_offset;\n"
"	int m_n;\n"
"	int m_padding[2];\n"
"};\n"
"\n"
"\n"
"RWStructuredBuffer<int> dstInt : register( u0 );\n"
"\n"
"[numthreads(64, 1, 1)]\n"
"void FillIntKernel( DEFAULT_ARGS )\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < m_n )\n"
"	{\n"
"		dstInt[ m_offset+gIdx ] = m_data.x;\n"
"	}\n"
"}\n"
"\n"
"RWStructuredBuffer<int2> dstInt2 : register( u0 );\n"
"\n"
"[numthreads(64, 1, 1)]\n"
"void FillInt2Kernel( DEFAULT_ARGS )\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < m_n )\n"
"	{\n"
"		dstInt2[ m_offset+gIdx ] = make_int2( m_data.x, m_data.y );\n"
"	}\n"
"}\n"
"\n"
"RWStructuredBuffer<int4> dstInt4 : register( u0 );\n"
"\n"
"[numthreads(64, 1, 1)]\n"
"void FillInt4Kernel( DEFAULT_ARGS )\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"\n"
"	if( gIdx < m_n )\n"
"	{\n"
"		dstInt4[ m_offset+gIdx ] = m_data;\n"
"	}\n"
"}\n"
;

View File

@@ -0,0 +1,231 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef ARRAY_H
#define ARRAY_H
#include <string.h>
#include <malloc.h>
#include <Common/Base/Error.h>
#include <new.h>
namespace adl
{
//	A 16 byte aligned, grow-only dynamic array (minimal vector analogue).
//	Elements are relocated with memcpy and their destructors are never run,
//	so T must be trivially copyable/destructible; the intended use is POD
//	math types such as float4.
template <class T>
class Array
{
	public:
		__inline
		Array();

		//	Creates an array holding size default-constructed elements.
		__inline
		Array(int size);

		__inline
		~Array();

		//	Asserts idx < getSize(); negative idx is not checked.
		__inline
		T& operator[] (int idx);

		__inline
		const T& operator[] (int idx) const;

		//	Appends a copy of elem, growing capacity when full.
		__inline
		void pushBack(const T& elem);

		//	Drops the last element (no destructor is run).
		__inline
		void popBack();

		//	Sets the size to zero; capacity is retained.
		__inline
		void clear();

		//	Resizes; grows capacity if needed, never shrinks it.
		__inline
		void setSize(int size);

		__inline
		int getSize() const;

		__inline
		T* begin();

		__inline
		const T* begin() const;

		__inline
		T* end();

		__inline
		const T* end() const;

		//	Linear search; returns -1 when data is not found.
		__inline
		int indexOf(const T& data) const;

		//	O(1) removal: the last element is moved into slot idx,
		//	so element order is NOT preserved.
		__inline
		void removeAt(int idx);

		//	Grows the array by one and returns a reference to the new slot.
		__inline
		T& expandOne();

	private:
		//	Copying is disabled; private and never called, so the empty
		//	body is unreachable.
		Array(const Array& a){}

	private:
		enum
		{
			DEFAULT_SIZE = 128,
			INCREASE_SIZE = 128,
		};

		T* m_data;
		int m_size;
		int m_capacity;
};

template<class T>
Array<T>::Array()
{
	m_size = 0;
	m_capacity = DEFAULT_SIZE;
	m_data = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
	for(int i=0; i<m_capacity; i++) new(&m_data[i])T;
}

template<class T>
Array<T>::Array(int size)
{
	m_size = size;
	m_capacity = size;
	m_data = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
	for(int i=0; i<m_capacity; i++) new(&m_data[i])T;
}

template<class T>
Array<T>::~Array()
{
	if( m_data )
	{
		//	element destructors are intentionally not run (POD-only type)
		_aligned_free( m_data );
		m_data = NULL;
	}
}

template<class T>
T& Array<T>::operator[](int idx)
{
	ADLASSERT(idx<m_size);
	return m_data[idx];
}

template<class T>
const T& Array<T>::operator[](int idx) const
{
	ADLASSERT(idx<m_size);
	return m_data[idx];
}

template<class T>
void Array<T>::pushBack(const T& elem)
{
	if( m_size == m_capacity )
	{
		int oldCap = m_capacity;
		m_capacity += INCREASE_SIZE;
		T* s = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
		memcpy( s, m_data, sizeof(T)*oldCap );
		//	BUGFIX: default-construct the newly added slots before the
		//	assignment below targets one of them; this matches the
		//	behavior of Array(int) and setSize()
		for(int i=oldCap; i<m_capacity; i++) new(&s[i])T;
		_aligned_free( m_data );
		m_data = s;
	}
	m_data[ m_size++ ] = elem;
}

template<class T>
void Array<T>::popBack()
{
	ADLASSERT( m_size>0 );
	m_size--;
}

template<class T>
void Array<T>::clear()
{
	m_size = 0;
}

template<class T>
void Array<T>::setSize(int size)
{
	if( size > m_capacity )
	{
		int oldCap = m_capacity;
		m_capacity = size;
		T* s = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
		for(int i=0; i<m_capacity; i++) new(&s[i])T;
		memcpy( s, m_data, sizeof(T)*oldCap );
		_aligned_free( m_data );
		m_data = s;
	}
	m_size = size;
}

template<class T>
int Array<T>::getSize() const
{
	return m_size;
}

template<class T>
const T* Array<T>::begin() const
{
	return m_data;
}

template<class T>
T* Array<T>::begin()
{
	return m_data;
}

template<class T>
T* Array<T>::end()
{
	return m_data+m_size;
}

template<class T>
const T* Array<T>::end() const
{
	return m_data+m_size;
}

template<class T>
int Array<T>::indexOf(const T& data) const
{
	for(int i=0; i<m_size; i++)
	{
		if( data == m_data[i] ) return i;
	}
	return -1;
}

template<class T>
void Array<T>::removeAt(int idx)
{
	ADLASSERT(idx<m_size);
	m_data[idx] = m_data[--m_size];
}

template<class T>
T& Array<T>::expandOne()
{
	setSize( m_size+1 );
	return m_data[ m_size-1 ];
}
};
#endif

View File

@@ -0,0 +1,173 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//	Construction and component-wise arithmetic for the host-side float2
//	emulation of the OpenCL vector type (see Math.h for the union layout).
__inline
float2 make_float2(float x, float y)
{
	float2 r;
	r.x = x;
	r.y = y;
	return r;
}
__inline
float2 make_float2(float x)
{
	return make_float2(x, x);
}
__inline
float2 make_float2(const int2& x)
{
	return make_float2((float)x.x, (float)x.y);
}
__inline
float2 operator-(const float2& a)
{
	return make_float2(-a.x, -a.y);
}
__inline
float2 operator*(const float2& a, const float2& b)
{
	return make_float2(a.x*b.x, a.y*b.y);
}
__inline
float2 operator*(float a, const float2& b)
{
	return make_float2(a*b.x, a*b.y);
}
__inline
float2 operator*(const float2& b, float a)
{
	return a*b;
}
__inline
void operator*=(float2& a, const float2& b)
{
	a = a*b;
}
__inline
void operator*=(float2& a, float b)
{
	a = a*b;
}
__inline
float2 operator/(const float2& a, const float2& b)
{
	return make_float2(a.x/b.x, a.y/b.y);
}
__inline
float2 operator/(const float2& b, float a)
{
	return make_float2(b.x/a, b.y/a);
}
__inline
void operator/=(float2& a, const float2& b)
{
	a = a/b;
}
__inline
void operator/=(float2& a, float b)
{
	a = a/b;
}
__inline
float2 operator+(const float2& a, const float2& b)
{
	return make_float2(a.x+b.x, a.y+b.y);
}
__inline
float2 operator+(const float2& a, float b)
{
	return make_float2(a.x+b, a.y+b);
}
__inline
float2 operator-(const float2& a, const float2& b)
{
	return make_float2(a.x-b.x, a.y-b.y);
}
__inline
float2 operator-(const float2& a, float b)
{
	return make_float2(a.x-b, a.y-b);
}
__inline
void operator+=(float2& a, const float2& b)
{
	a = a+b;
}
__inline
void operator+=(float2& a, float b)
{
	a = a+b;
}
__inline
void operator-=(float2& a, const float2& b)
{
	a = a-b;
}
__inline
void operator-=(float2& a, float b)
{
	a = a-b;
}

View File

@@ -0,0 +1,375 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//	CHECK_ALIGNMENT can assert 16 byte alignment of an operand when the
//	commented-out definition is enabled; by default it is a no-op
//	evaluation of its argument.
//#define CHECK_ALIGNMENT(a) ADLASSERT((u32(&(a)) & 0xf) == 0);
#define CHECK_ALIGNMENT(a) a;
//	Constructors for the vector types defined in Math.h; name (.x) and
//	index (.s[]) accessors alias the same storage.
__inline
float4 make_float4(float x, float y, float z, float w = 0.f)
{
	float4 r;
	r.s[0] = x;
	r.s[1] = y;
	r.s[2] = z;
	r.s[3] = w;
	return r;
}
__inline
float4 make_float4(float x)
{
	return make_float4(x, x, x, x);
}
__inline
float4 make_float4(const int4& x)
{
	return make_float4((float)x.x, (float)x.y, (float)x.z, (float)x.w);
}
__inline
int4 make_int4(int x, int y, int z, int w = 0)
{
	int4 r;
	r.x = x;
	r.y = y;
	r.z = z;
	r.w = w;
	return r;
}
__inline
int4 make_int4(int x)
{
	return make_int4(x, x, x, x);
}
__inline
int4 make_int4(const float4& x)
{
	return make_int4((int)x.s[0], (int)x.s[1], (int)x.s[2], (int)x.s[3]);
}
__inline
int2 make_int2(int a, int b)
{
	int2 r;
	r.s[0] = a;
	r.s[1] = b;
	return r;
}
__inline
bool operator ==(const int2& a, const int2& b)
{
return a.x==b.x && a.y==b.y;
}
__inline
bool operator ==(const int4& a, const int4& b)
{
return a.x==b.x && a.y==b.y && a.z==b.z && a.w==b.w;
}
__inline
bool operator ==(const float2& a, const float2& b)
{
return a.x==b.x && a.y==b.y;
}
__inline
bool operator ==(const float4& a, const float4& b)
{
return a.x==b.x && a.y==b.y && a.z==b.z && a.w==b.w;
}
__inline
float4 operator-(const float4& a)
{
return make_float4(-a.x, -a.y, -a.z, -a.w);
}
__inline
float4 operator*(const float4& a, const float4& b)
{
// ADLASSERT((u32(&a) & 0xf) == 0);
float4 out;
out.s[0] = a.s[0]*b.s[0];
out.s[1] = a.s[1]*b.s[1];
out.s[2] = a.s[2]*b.s[2];
out.s[3] = a.s[3]*b.s[3];
return out;
}
__inline
float4 operator*(float a, const float4& b)
{
return make_float4(a*b.s[0], a*b.s[1], a*b.s[2], a*b.s[3]);
}
__inline
float4 operator*(const float4& b, float a)
{
CHECK_ALIGNMENT(b);
return make_float4(a*b.s[0], a*b.s[1], a*b.s[2], a*b.s[3]);
}
__inline
void operator*=(float4& a, const float4& b)
{
CHECK_ALIGNMENT(a);
a.s[0]*=b.s[0];
a.s[1]*=b.s[1];
a.s[2]*=b.s[2];
a.s[3]*=b.s[3];
}
__inline
void operator*=(float4& a, float b)
{
CHECK_ALIGNMENT(a);
a.s[0]*=b;
a.s[1]*=b;
a.s[2]*=b;
a.s[3]*=b;
}
/*
__inline
bool operator ==(const float4& a, const float4& b)
{
}
*/
//
__inline
float4 operator/(const float4& a, const float4& b)
{
CHECK_ALIGNMENT(a);
float4 out;
out.s[0] = a.s[0]/b.s[0];
out.s[1] = a.s[1]/b.s[1];
out.s[2] = a.s[2]/b.s[2];
out.s[3] = a.s[3]/b.s[3];
return out;
}
__inline
float4 operator/(const float4& b, float a)
{
CHECK_ALIGNMENT(b);
return make_float4(b.s[0]/a, b.s[1]/a, b.s[2]/a, b.s[3]/a);
}
__inline
void operator/=(float4& a, const float4& b)
{
a.s[0]/=b.s[0];
a.s[1]/=b.s[1];
a.s[2]/=b.s[2];
a.s[3]/=b.s[3];
}
__inline
void operator/=(float4& a, float b)
{
ADLASSERT((u32(&a) & 0xf) == 0);
a.s[0]/=b;
a.s[1]/=b;
a.s[2]/=b;
a.s[3]/=b;
}
//
__inline
float4 operator+(const float4& a, const float4& b)
{
CHECK_ALIGNMENT(a);
float4 out;
out.s[0] = a.s[0]+b.s[0];
out.s[1] = a.s[1]+b.s[1];
out.s[2] = a.s[2]+b.s[2];
out.s[3] = a.s[3]+b.s[3];
return out;
}
__inline
float4 operator+(const float4& a, float b)
{
CHECK_ALIGNMENT(a);
float4 out;
out.s[0] = a.s[0]+b;
out.s[1] = a.s[1]+b;
out.s[2] = a.s[2]+b;
out.s[3] = a.s[3]+b;
return out;
}
__inline
float4 operator-(const float4& a, const float4& b)
{
CHECK_ALIGNMENT(a);
float4 out;
out.s[0] = a.s[0]-b.s[0];
out.s[1] = a.s[1]-b.s[1];
out.s[2] = a.s[2]-b.s[2];
out.s[3] = a.s[3]-b.s[3];
return out;
}
__inline
float4 operator-(const float4& a, float b)
{
CHECK_ALIGNMENT(a);
float4 out;
out.s[0] = a.s[0]-b;
out.s[1] = a.s[1]-b;
out.s[2] = a.s[2]-b;
out.s[3] = a.s[3]-b;
return out;
}
__inline
void operator+=(float4& a, const float4& b)
{
CHECK_ALIGNMENT(a);
a.s[0]+=b.s[0];
a.s[1]+=b.s[1];
a.s[2]+=b.s[2];
a.s[3]+=b.s[3];
}
__inline
void operator+=(float4& a, float b)
{
CHECK_ALIGNMENT(a);
a.s[0]+=b;
a.s[1]+=b;
a.s[2]+=b;
a.s[3]+=b;
}
__inline
void operator-=(float4& a, const float4& b)
{
CHECK_ALIGNMENT(a);
a.s[0]-=b.s[0];
a.s[1]-=b.s[1];
a.s[2]-=b.s[2];
a.s[3]-=b.s[3];
}
__inline
void operator-=(float4& a, float b)
{
CHECK_ALIGNMENT(a);
a.s[0]-=b;
a.s[1]-=b;
a.s[2]-=b;
a.s[3]-=b;
}
__inline
float4 cross3(const float4& a, const float4& b)
{
return make_float4(a.s[1]*b.s[2]-a.s[2]*b.s[1],
a.s[2]*b.s[0]-a.s[0]*b.s[2],
a.s[0]*b.s[1]-a.s[1]*b.s[0],
0);
}
__inline
float dot3F4(const float4& a, const float4& b)
{
return a.x*b.x+a.y*b.y+a.z*b.z;
}
__inline
float length3(const float4& a)
{
return sqrtf(dot3F4(a,a));
}
__inline
float dot4(const float4& a, const float4& b)
{
return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
}
// for height
__inline
float dot3w1(const float4& point, const float4& eqn)
{
return point.x*eqn.x+point.y*eqn.y+point.z*eqn.z+eqn.w;
}
__inline
float4 normalize3(const float4& a)
{
float length = sqrtf(dot3F4(a, a));
return 1.f/length * a;
}
__inline
float4 normalize4(const float4& a)
{
float length = sqrtf(dot4(a, a));
return 1.f/length * a;
}
__inline
float4 createEquation(const float4& a, const float4& b, const float4& c)
{
float4 eqn;
float4 ab = b-a;
float4 ac = c-a;
eqn = normalize3( cross3(ab, ac) );
eqn.w = -dot3F4(eqn,a);
return eqn;
}
__inline
float intersectPlaneLine( const float4& planeEqn, const float4& vec, const float4& orig )
{
return (-planeEqn.w - dot3F4(planeEqn, orig))/dot3F4(planeEqn, vec);
}
template<>
__inline
float4 max2(const float4& a, const float4& b)
{
return make_float4( max2(a.x,b.x), max2(a.y,b.y), max2(a.z,b.z), max2(a.w,b.w) );
}
template<>
__inline
float4 min2(const float4& a, const float4& b)
{
return make_float4( min2(a.x,b.x), min2(a.y,b.y), min2(a.z,b.z), min2(a.w,b.w) );
}

View File

@@ -0,0 +1,224 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef CL_MATH_H
#define CL_MATH_H
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <xmmintrin.h>
#include <Adl/Adl.h>
#include <algorithm>
#define pxSort std::sort
#define PI 3.14159265358979323846f
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
#define _MEM_CLASSALIGN16 __declspec(align(16))
#define _MEM_ALIGNED_ALLOCATOR16 void* operator new(size_t size) { return _aligned_malloc( size, 16 ); } \
void operator delete(void *p) { _aligned_free( p ); } \
void* operator new[](size_t size) { return _aligned_malloc( size, 16 ); } \
void operator delete[](void *p) { _aligned_free( p ); } \
void* operator new(size_t size, void* p) { return p; } \
void operator delete(void *p, void* pp) {}
namespace adl
{
///	Rounds n up to the next power of two; returns n unchanged when it is
///	already a power of two (unsigned T; nextPowerOf2(0) yields 0).
template<class T>
T nextPowerOf2(T n)
{
	n -= 1;
	//	smear the highest set bit into every lower position; doubling the
	//	shift distance needs only log2(bits) steps instead of one step per
	//	bit (the old loop also performed a useless shift by zero)
	for(unsigned int i=1; i<sizeof(T)*8; i*=2)
		n = n | (n>>i);
	return n+1;
}
// Fixed-width scalar aliases used throughout the Adl primitives.
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
// 16 byte aligned 4-float vector. The union lets code address components
// by name (x,y,z,w), by index (s[0..3]) or as an SSE register (m_quad).
// NOTE: anonymous structs inside a union are a compiler extension
// (accepted by MSVC/GCC/Clang).
_MEM_CLASSALIGN16
struct float4
{
_MEM_ALIGNED_ALLOCATOR16;
union
{
struct
{
float x,y,z,w;
};
struct
{
float s[4];
};
__m128 m_quad;
};
};
// 16 byte aligned 4-int vector (name or index access).
_MEM_CLASSALIGN16
struct int4
{
_MEM_ALIGNED_ALLOCATOR16;
union
{
struct
{
int x,y,z,w;
};
struct
{
int s[4];
};
};
};
// 16 byte aligned 4-uint vector (name or index access).
_MEM_CLASSALIGN16
struct uint4
{
_MEM_ALIGNED_ALLOCATOR16;
union
{
struct
{
u32 x,y,z,w;
};
struct
{
u32 s[4];
};
};
};
// Unaligned 2-int vector (name or index access).
struct int2
{
union
{
struct
{
int x,y;
};
struct
{
int s[2];
};
};
};
// Unaligned 2-float vector (name or index access).
struct float2
{
union
{
struct
{
float x,y;
};
struct
{
float s[2];
};
};
};
//	Returns the larger of a and b (b when they compare equal); works with
//	any type providing operator>.
template<typename T>
__inline
T max2(const T& a, const T& b)
{
	if( a > b ) return a;
	return b;
}
//	Returns the smaller of a and b (b when they compare equal).
template<typename T>
__inline
T min2(const T& a, const T& b)
{
	if( a < b ) return a;
	return b;
}
#include <AdlPrimitives/Math/Float4.inl>
#include <AdlPrimitives/Math/Float2.inl>
//	Exchanges the values of a and b via a temporary copy.
template<typename T>
void swap2(T& a, T& b)
{
	T old = a;
	a = b;
	b = old;
}
//	Seeds the C library pseudo-random generator that getRandom() draws from.
__inline
void seedRandom(int seed)
{
	srand( (unsigned int)seed );
}
//	Returns a pseudo-random value in [minV, maxV), sampled with 1/10000
//	resolution from rand(); call seedRandom() first for reproducibility.
template<typename T>
__inline
T getRandom(const T& minV, const T& maxV)
{
	float t = (rand()%10000)/10000.f;
	return (T)(minV + t*(maxV - minV));
}
// float4 specialization: draws an independent sample per component.
// NOTE(review): the evaluation order of the four rand() calls inside
// make_float4 is unspecified, so the per-component streams are
// compiler-dependent (the distribution itself is unaffected).
template<>
__inline
float4 getRandom(const float4& minV, const float4& maxV)
{
float4 r = make_float4( (rand()%10000)/10000.f, (rand()%10000)/10000.f, (rand()%10000)/10000.f, (rand()%10000)/10000.f );
float4 range = maxV - minV;
return (minV + r*range);
}
//	Returns a T* located offset bytes past baseAddr. BUGFIX: uses char*
//	arithmetic instead of round-tripping the pointer through u32, which
//	truncated addresses on 64 bit platforms.
template<typename T>
T* addByteOffset(void* baseAddr, u32 offset)
{
	return (T*)((char*)baseAddr + offset);
}
// Plain pair of 32-bit unsigned values; the default constructor leaves
// the members uninitialized.
struct Pair32
{
Pair32(){}
Pair32(u32 a, u32 b) : m_a(a), m_b(b){}
u32 m_a;
u32 m_b;
};
// Pair of type-erased pointers; the templated constructor accepts any
// pointer type and stores it as void*.
struct PtrPair
{
PtrPair(){}
PtrPair(void* a, void* b) : m_a(a), m_b(b){}
template<typename T>
PtrPair(T* a, T* b) : m_a((void*)a), m_b((void*)b){}
void* m_a;
void* m_b;
};
};
#endif

View File

@@ -0,0 +1,357 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// OpenCL extensions required by the kernels (printf for debugging,
// 32-bit atomics, atomic counters).
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
// Portability shims shared with the HLSL build: work-item indices,
// barriers/fences, atomics and vector constructor macros.
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GET_NUM_GROUPS get_num_groups(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define AppendInc(x, out) out = atomic_inc(x)
#define AtomAdd(x, value) atom_add(&(x), value)
#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
#define AtomXhg(x, value) atom_xchg ( &(x), value )
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
#define make_float4 (float4)
#define make_float2 (float2)
#define make_uint4 (uint4)
#define make_int4 (int4)
#define make_uint2 (uint2)
#define make_int2 (int2)
#define max2 max
#define min2 min
///////////////////////////////////////
//	Vector
///////////////////////////////////////
// The fast* helpers route through the native_* built-ins, which trade
// precision for speed; the commented-out lines are the precise versions.
__inline
float fastDiv(float numerator, float denominator)
{
return native_divide(numerator, denominator);
//	return numerator/denominator;
}
__inline
float4 fastDiv4(float4 numerator, float4 denominator)
{
return native_divide(numerator, denominator);
}
__inline
float fastSqrtf(float f2)
{
return native_sqrt(f2);
//	return sqrt(f2);
}
__inline
float fastRSqrt(float f2)
{
return native_rsqrt(f2);
}
__inline
float fastLength4(float4 v)
{
return fast_length(v);
}
__inline
float4 fastNormalize4(float4 v)
{
return fast_normalize(v);
}
// Shadows the C name so all kernel code uses the native square root.
__inline
float sqrtf(float a)
{
//	return sqrt(a);
return native_sqrt(a);
}
__inline
float4 cross3(float4 a, float4 b)
{
return cross(a,b);
}
// 3-component dot product: w is masked to 0 before the built-in dot.
__inline
float dot3F4(float4 a, float4 b)
{
float4 a1 = make_float4(a.xyz,0.f);
float4 b1 = make_float4(b.xyz,0.f);
return dot(a1, b1);
}
__inline
float length3(const float4 a)
{
return sqrtf(dot3F4(a,a));
}
__inline
float dot4(const float4 a, const float4 b)
{
return dot( a, b );
}
// for height
// Evaluates the plane equation eqn at point (xyz dot plus offset w).
__inline
float dot3w1(const float4 point, const float4 eqn)
{
return dot3F4(point,eqn) + eqn.w;
}
__inline
float4 normalize3(const float4 a)
{
float4 n = make_float4(a.x, a.y, a.z, 0.f);
return fastNormalize4( n );
//	float length = sqrtf(dot3F4(a, a));
//	return 1.f/length * a;
}
__inline
float4 normalize4(const float4 a)
{
float length = sqrtf(dot4(a, a));
return 1.f/length * a;
}
// Plane through points a, b, c: unit normal in xyz, signed offset in w.
__inline
float4 createEquation(const float4 a, const float4 b, const float4 c)
{
float4 eqn;
float4 ab = b-a;
float4 ac = c-a;
eqn = normalize3( cross3(ab, ac) );
eqn.w = -dot3F4(eqn,a);
return eqn;
}
///////////////////////////////////////
//	Matrix3x3
///////////////////////////////////////
// Row-major 3x3 matrix stored in three float4 rows (w components unused).
typedef struct
{
float4 m_row[3];
}Matrix3x3;
__inline
Matrix3x3 mtZero();
__inline
Matrix3x3 mtIdentity();
__inline
Matrix3x3 mtTranspose(Matrix3x3 m);
__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);
__inline
float4 mtMul1(Matrix3x3 a, float4 b);
__inline
float4 mtMul3(float4 a, Matrix3x3 b);
__inline
Matrix3x3 mtZero()
{
Matrix3x3 m;
m.m_row[0] = (float4)(0.f);
m.m_row[1] = (float4)(0.f);
m.m_row[2] = (float4)(0.f);
return m;
}
__inline
Matrix3x3 mtIdentity()
{
Matrix3x3 m;
m.m_row[0] = (float4)(1,0,0,0);
m.m_row[1] = (float4)(0,1,0,0);
m.m_row[2] = (float4)(0,0,1,0);
return m;
}
// Transpose; w of every result row is set to 0.
__inline
Matrix3x3 mtTranspose(Matrix3x3 m)
{
Matrix3x3 out;
out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
return out;
}
// Matrix product a*b via 3-component dot products against b's columns.
__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)
{
Matrix3x3 transB;
transB = mtTranspose( b );
Matrix3x3 ans;
//	why this doesn't run when 0ing in the for{}
// NOTE(review): zeroing w outside the loop looks like a deliberate
// workaround for a compiler issue (see the comment above) -- do not
// fold it back into the loop without re-testing on the target driver.
a.m_row[0].w = 0.f;
a.m_row[1].w = 0.f;
a.m_row[2].w = 0.f;
for(int i=0; i<3; i++)
{
//	a.m_row[i].w = 0.f;
ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
ans.m_row[i].w = 0.f;
}
return ans;
}
// Matrix times column vector; result w is zeroed.
__inline
float4 mtMul1(Matrix3x3 a, float4 b)
{
float4 ans;
ans.x = dot3F4( a.m_row[0], b );
ans.y = dot3F4( a.m_row[1], b );
ans.z = dot3F4( a.m_row[2], b );
ans.w = 0.f;
return ans;
}
// Row vector times matrix. NOTE(review): result w is never written --
// presumably callers ignore it; verify before relying on it.
__inline
float4 mtMul3(float4 a, Matrix3x3 b)
{
float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
float4 ans;
ans.x = dot3F4( a, colx );
ans.y = dot3F4( a, coly );
ans.z = dot3F4( a, colz );
return ans;
}
///////////////////////////////////////
//	Quaternion
///////////////////////////////////////
// Quaternions live in a float4: xyz = vector part, w = scalar part.
typedef float4 Quaternion;
__inline
Quaternion qtMul(Quaternion a, Quaternion b);
__inline
Quaternion qtNormalize(Quaternion in);
__inline
float4 qtRotate(Quaternion q, float4 vec);
__inline
Quaternion qtInvert(Quaternion q);
__inline
Matrix3x3 qtGetRotationMatrix(Quaternion q);
// Hamilton product a*b.
__inline
Quaternion qtMul(Quaternion a, Quaternion b)
{
Quaternion ans;
ans = cross3( a, b );
ans += a.w*b+b.w*a;
//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
ans.w = a.w*b.w - dot3F4(a, b);
return ans;
}
__inline
Quaternion qtNormalize(Quaternion in)
{
return fastNormalize4(in);
//	in /= length( in );
//	return in;
}
// Rotates vec by q as q * vec * q^-1; vec.w is cleared first.
__inline
float4 qtRotate(Quaternion q, float4 vec)
{
Quaternion qInv = qtInvert( q );
float4 vcpy = vec;
vcpy.w = 0.f;
float4 out = qtMul(qtMul(q,vcpy),qInv);
return out;
}
// Conjugate; equal to the inverse for unit quaternions.
__inline
Quaternion qtInvert(Quaternion q)
{
return (Quaternion)(-q.xyz, q.w);
}
__inline
float4 qtInvRotate(const Quaternion q, float4 vec)
{
return qtRotate( qtInvert( q ), vec );
}
// Converts a quaternion to its 3x3 rotation matrix (rows' w zeroed).
__inline
Matrix3x3 qtGetRotationMatrix(Quaternion quat)
{
float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);
Matrix3x3 out;
out.m_row[0].x=1-2*quat2.y-2*quat2.z;
out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;
out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;
out.m_row[0].w = 0.f;
out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;
out.m_row[1].y=1-2*quat2.x-2*quat2.z;
out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;
out.m_row[1].w = 0.f;
out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;
out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;
out.m_row[2].z=1-2*quat2.x-2*quat2.y;
out.m_row[2].w = 0.f;
return out;
}

View File

@@ -0,0 +1,197 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef MATRIX3X3_H
#define MATRIX3X3_H
#include <AdlPrimitives/Math/Math.h>
///////////////////////////////////////
// Matrix3x3
///////////////////////////////////////
namespace adl
{
// Row-major 3x3 matrix stored as three 16-byte-aligned float4 rows
// (w components unused unless noted by the individual functions).
typedef
_MEM_CLASSALIGN16 struct
{
_MEM_ALIGNED_ALLOCATOR16;
float4 m_row[3];
}Matrix3x3;
__inline
Matrix3x3 mtZero();
__inline
Matrix3x3 mtIdentity();
__inline
Matrix3x3 mtDiagonal(float a, float b, float c);
__inline
Matrix3x3 mtTranspose(const Matrix3x3& m);
// Matrix product a*b.
__inline
Matrix3x3 mtMul(const Matrix3x3& a, const Matrix3x3& b);
// Matrix times column vector.
__inline
float4 mtMul1(const Matrix3x3& a, const float4& b);
// Scalar times matrix.
__inline
Matrix3x3 mtMul2(float a, const Matrix3x3& b);
// Row vector times matrix.
__inline
float4 mtMul3(const float4& b, const Matrix3x3& a);
__inline
Matrix3x3 mtInvert(const Matrix3x3& m);
//	Returns the zero matrix.
__inline
Matrix3x3 mtZero()
{
	Matrix3x3 m;
	for(int i=0; i<3; i++)
		m.m_row[i] = make_float4(0.f);
	return m;
}
//	Returns the identity matrix.
__inline
Matrix3x3 mtIdentity()
{
	return mtDiagonal(1.f, 1.f, 1.f);
}
//	Returns a diagonal matrix with diagonal (a,b,c).
__inline
Matrix3x3 mtDiagonal(float a, float b, float c)
{
	Matrix3x3 m;
	m.m_row[0] = make_float4(a,0,0);
	m.m_row[1] = make_float4(0,b,0);
	m.m_row[2] = make_float4(0,0,c);
	return m;
}
//	Returns the transpose; w of every result row is set to 0.
__inline
Matrix3x3 mtTranspose(const Matrix3x3& m)
{
	Matrix3x3 t;
	for(int c=0; c<3; c++)
		t.m_row[c] = make_float4(m.m_row[0].s[c], m.m_row[1].s[c], m.m_row[2].s[c], 0.f);
	return t;
}
//	Matrix product a*b: each result row is built from 3-component dot
//	products against the columns of b (obtained via transpose).
//	BUGFIX: the w of every result row is now explicitly zeroed instead of
//	being left uninitialized, matching mtTranspose and the OpenCL twin.
__inline
Matrix3x3 mtMul(const Matrix3x3& a, const Matrix3x3& b)
{
	Matrix3x3 transB;
	transB = mtTranspose( b );
	Matrix3x3 ans;
	for(int i=0; i<3; i++)
	{
		ans.m_row[i].s[0] = dot3F4(a.m_row[i],transB.m_row[0]);
		ans.m_row[i].s[1] = dot3F4(a.m_row[i],transB.m_row[1]);
		ans.m_row[i].s[2] = dot3F4(a.m_row[i],transB.m_row[2]);
		ans.m_row[i].s[3] = 0.f;
	}
	return ans;
}
//	Matrix times column vector (b's w is ignored by dot3F4).
//	BUGFIX: the result's w is now zeroed instead of being left
//	uninitialized, matching the OpenCL mtMul1.
__inline
float4 mtMul1(const Matrix3x3& a, const float4& b)
{
	float4 ans;
	ans.s[0] = dot3F4( a.m_row[0], b );
	ans.s[1] = dot3F4( a.m_row[1], b );
	ans.s[2] = dot3F4( a.m_row[2], b );
	ans.s[3] = 0.f;
	return ans;
}
//	Uniform scale: multiplies every row of b by the scalar a.
__inline
Matrix3x3 mtMul2(float a, const Matrix3x3& b)
{
	Matrix3x3 r;
	for(int i=0; i<3; i++)
		r.m_row[i] = a*b.m_row[i];
	return r;
}
//	Row vector times matrix (a's w is ignored).
//	BUGFIX: the result's w is now zeroed instead of being left
//	uninitialized.
__inline
float4 mtMul3(const float4& a, const Matrix3x3& b)
{
	float4 ans;
	ans.x = a.x*b.m_row[0].x + a.y*b.m_row[1].x + a.z*b.m_row[2].x;
	ans.y = a.x*b.m_row[0].y + a.y*b.m_row[1].y + a.z*b.m_row[2].y;
	ans.z = a.x*b.m_row[0].z + a.y*b.m_row[1].z + a.z*b.m_row[2].z;
	ans.w = 0.f;
	return ans;
}
// Inverse via the adjugate (transposed cofactor matrix) divided by the
// determinant. NOTE(review): ADLASSERT(det) is an exact float-vs-zero
// test, so near-singular matrices pass the assert and produce huge
// values rather than failing.
__inline
Matrix3x3 mtInvert(const Matrix3x3& m)
{
// determinant by the rule of Sarrus
float det = m.m_row[0].s[0]*m.m_row[1].s[1]*m.m_row[2].s[2]+m.m_row[1].s[0]*m.m_row[2].s[1]*m.m_row[0].s[2]+m.m_row[2].s[0]*m.m_row[0].s[1]*m.m_row[1].s[2]
-m.m_row[0].s[0]*m.m_row[2].s[1]*m.m_row[1].s[2]-m.m_row[2].s[0]*m.m_row[1].s[1]*m.m_row[0].s[2]-m.m_row[1].s[0]*m.m_row[0].s[1]*m.m_row[2].s[2];
ADLASSERT( det );
Matrix3x3 ans;
// adjugate entries; w of each row is zeroed explicitly
ans.m_row[0].s[0] = m.m_row[1].s[1]*m.m_row[2].s[2] - m.m_row[1].s[2]*m.m_row[2].s[1];
ans.m_row[0].s[1] = m.m_row[0].s[2]*m.m_row[2].s[1] - m.m_row[0].s[1]*m.m_row[2].s[2];
ans.m_row[0].s[2] = m.m_row[0].s[1]*m.m_row[1].s[2] - m.m_row[0].s[2]*m.m_row[1].s[1];
ans.m_row[0].w = 0.f;
ans.m_row[1].s[0] = m.m_row[1].s[2]*m.m_row[2].s[0] - m.m_row[1].s[0]*m.m_row[2].s[2];
ans.m_row[1].s[1] = m.m_row[0].s[0]*m.m_row[2].s[2] - m.m_row[0].s[2]*m.m_row[2].s[0];
ans.m_row[1].s[2] = m.m_row[0].s[2]*m.m_row[1].s[0] - m.m_row[0].s[0]*m.m_row[1].s[2];
ans.m_row[1].w = 0.f;
ans.m_row[2].s[0] = m.m_row[1].s[0]*m.m_row[2].s[1] - m.m_row[1].s[1]*m.m_row[2].s[0];
ans.m_row[2].s[1] = m.m_row[0].s[1]*m.m_row[2].s[0] - m.m_row[0].s[0]*m.m_row[2].s[1];
ans.m_row[2].s[2] = m.m_row[0].s[0]*m.m_row[1].s[1] - m.m_row[0].s[1]*m.m_row[1].s[0];
ans.m_row[2].w = 0.f;
ans = mtMul2((1.0f/det), ans);
return ans;
}
__inline
//	Builds a matrix from three row vectors (copied verbatim, w included).
Matrix3x3 mtSet( const float4& a, const float4& b, const float4& c )
{
	const float4 rows[3] = { a, b, c };
	Matrix3x3 m;
	for(int r=0; r<3; r++)
		m.m_row[r] = rows[r];
	return m;
}
__inline
//	Component-wise matrix addition.
Matrix3x3 operator+(const Matrix3x3& a, const Matrix3x3& b)
{
	Matrix3x3 sum;
	for(int r=0; r<3; r++)
		sum.m_row[r] = a.m_row[r] + b.m_row[r];
	return sum;
}
};
#endif

View File

@@ -0,0 +1,159 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef QUATERNION_H
#define QUATERNION_H
#include <AdlPrimitives/Math/Matrix3x3.h>
namespace adl
{
typedef float4 Quaternion;
__inline
Quaternion qtSet(const float4& axis, float angle);
__inline
Quaternion qtMul(const Quaternion& a, const Quaternion& b);
__inline
float4 qtRotate(const Quaternion& q, const float4& vec);
__inline
float4 qtInvRotate(const Quaternion& q, const float4& vec);
__inline
Quaternion qtInvert(const Quaternion& q);
__inline
Matrix3x3 qtGetRotationMatrix(const Quaternion& quat);
__inline
Quaternion qtNormalize(const Quaternion& q);
__inline
Quaternion qtGetIdentity() { return make_float4(0,0,0,1); }	// (x,y,z,w): zero rotation
__inline
//	Builds a unit quaternion from a (not necessarily normalized) rotation axis
//	and an angle in radians.
Quaternion qtSet(const float4& axis, float angle)
{
	float4 nAxis = normalize3( axis );
	float halfAngle = angle/2;
	float sinHalf = sin(halfAngle);	// hoisted: was recomputed for each component
	Quaternion q;
	q.s[0] = nAxis.s[0]*sinHalf;
	q.s[1] = nAxis.s[1]*sinHalf;
	q.s[2] = nAxis.s[2]*sinHalf;
	q.s[3] = cos(halfAngle);
	return q;
}
__inline
//	Hamilton product a*b (composition: apply b's rotation, then a's).
Quaternion qtMul(const Quaternion& a, const Quaternion& b)
{
Quaternion ans;
//	Vector part: cross(a.xyz, b.xyz) + a.w*b + b.w*a (the w lane written here
//	is overwritten below).
ans = cross3( a, b );
ans += a.s[3]*b + b.s[3]*a;
//	Scalar part: a.w*b.w - dot(a.xyz, b.xyz).
ans.s[3] = a.s[3]*b.s[3] - (a.s[0]*b.s[0]+a.s[1]*b.s[1]+a.s[2]*b.s[2]);
return ans;
}
__inline
//	Rotates vec by q via the sandwich product q * v * q^-1 (w of vec is cleared).
float4 qtRotate(const Quaternion& q, const float4& vec)
{
	Quaternion pureVec = vec;
	pureVec.s[3] = 0.f;
	return qtMul( qtMul( q, pureVec ), qtInvert( q ) );
}
__inline
//	Applies the inverse of q's rotation to vec.
float4 qtInvRotate(const Quaternion& q, const float4& vec)
{
	const Quaternion qInv = qtInvert( q );
	return qtRotate( qInv, vec );
}
__inline
//	Conjugate of q (negated vector part); the inverse for unit quaternions.
Quaternion qtInvert(const Quaternion& q)
{
	Quaternion conj = q;
	conj.s[0] = -conj.s[0];
	conj.s[1] = -conj.s[1];
	conj.s[2] = -conj.s[2];
	return conj;
}
__inline
//	Converts a (unit) quaternion to a 3x3 rotation matrix; the w component of
//	every row is zeroed.
Matrix3x3 qtGetRotationMatrix(const Quaternion& quat)
{
//	Squared components, reused across the diagonal terms.
float4 quat2 = make_float4(quat.s[0]*quat.s[0], quat.s[1]*quat.s[1], quat.s[2]*quat.s[2], 0.f);
Matrix3x3 out;
out.m_row[0].s[0]=1-2*quat2.s[1]-2*quat2.s[2];
out.m_row[0].s[1]=2*quat.s[0]*quat.s[1]-2*quat.s[3]*quat.s[2];
out.m_row[0].s[2]=2*quat.s[0]*quat.s[2]+2*quat.s[3]*quat.s[1];
out.m_row[0].s[3] = 0.f;
out.m_row[1].s[0]=2*quat.s[0]*quat.s[1]+2*quat.s[3]*quat.s[2];
out.m_row[1].s[1]=1-2*quat2.s[0]-2*quat2.s[2];
out.m_row[1].s[2]=2*quat.s[1]*quat.s[2]-2*quat.s[3]*quat.s[0];
out.m_row[1].s[3] = 0.f;
out.m_row[2].s[0]=2*quat.s[0]*quat.s[2]-2*quat.s[3]*quat.s[1];
out.m_row[2].s[1]=2*quat.s[1]*quat.s[2]+2*quat.s[3]*quat.s[0];
out.m_row[2].s[2]=1-2*quat2.s[0]-2*quat2.s[1];
out.m_row[2].s[3] = 0.f;
return out;
}
__inline
//	Extracts a quaternion from a rotation matrix (passed by pointer).
//	NOTE(review): divides by 4w, so this is numerically unstable (and divides
//	by zero) when the matrix trace approaches -1, i.e. for rotations near 180
//	degrees — confirm callers never pass such matrices.
Quaternion qtGetQuaternion(const Matrix3x3* m)
{
Quaternion q;
q.w = sqrtf( m[0].m_row[0].x + m[0].m_row[1].y + m[0].m_row[2].z + 1 ) * 0.5f;
float inv4w = 1.f/(4.f*q.w);
q.x = (m[0].m_row[2].y-m[0].m_row[1].z)*inv4w;
q.y = (m[0].m_row[0].z-m[0].m_row[2].x)*inv4w;
q.z = (m[0].m_row[1].x-m[0].m_row[0].y)*inv4w;
return q;
}
__inline
//	Normalizes all four components (4D Euclidean length).
Quaternion qtNormalize(const Quaternion& q)
{
return normalize4(q);
}
__inline
//	Rigid-body transform of point p: rotate by orientation, then translate.
float4 transform(const float4& p, const float4& translation, const Quaternion& orientation)
{
	const float4 rotated = qtRotate( orientation, p );
	return rotated + translation;
}
__inline
//	Inverse of transform(): undo the translation, then undo the rotation.
float4 invTransform(const float4& p, const float4& translation, const Quaternion& orientation)
{
	//	Resolves the old TODO: qtInvRotate(q, v) == qtRotate(qtInvert(q), v).
	return qtInvRotate( orientation, p-translation );
}
};
#endif

View File

@@ -0,0 +1,73 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma once
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
namespace adl
{
//	Option flags shared by every PrefixScan device specialization.
class PrefixScanBase
{
public:
enum Option
{
INCLUSIVE,	// dst[i] = src[0] + ... + src[i]
EXCLUSIVE	// dst[i] = src[0] + ... + src[i-1]; dst[0] = 0
};
};
//	Two-level GPU prefix scan: each work-group scans BLOCK_SIZE*2 elements
//	locally, a single group scans the per-block sums, and a final pass
//	propagates the block offsets back into the output.
template<DeviceType TYPE>
class PrefixScan : public PrefixScanBase
{
public:
typedef Launcher::BufferInfo BufferInfo;
enum
{
BLOCK_SIZE = 128	// work-group size; each group handles BLOCK_SIZE*2 elements
};
//	Per-instance state: kernels plus scratch/constant buffers.
struct Data
{
Option m_option;
const Device* m_device;
Kernel* m_localScanKernel;	// per-block local scan
Kernel* m_blockSumKernel;	// top-level scan of the block sums
Kernel* m_propagationKernel;	// adds block offsets into dst
Buffer<u32>* m_workBuffer;	// per-block sums
Buffer<int4>* m_constBuffer[3];// todo. dx need one for each
int m_maxSize;	// capacity execute() is allowed to scan
};
static
Data* allocate(const Device* deviceData, int maxSize, Option option = EXCLUSIVE);
static
void deallocate(Data* data);
static
void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, u32* sum = 0);
};
#include <AdlPrimitives/Scan/PrefixScanHost.inl>
#include <AdlPrimitives/Scan/PrefixScan.inl>
};

View File

@@ -0,0 +1,125 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Scan\\PrefixScanKernels"
#define KERNEL0 "LocalScanKernel"
#define KERNEL1 "TopLevelScanKernel"
#define KERNEL2 "AddOffsetKernel"
#include <AdlPrimitives/Scan/PrefixScanKernelsCL.h>
#include <AdlPrimitives/Scan/PrefixScanKernelsDX11.h>
//	Creates the kernels and scratch storage needed to scan up to maxSize
//	elements on the given device.
template<DeviceType TYPE>
typename PrefixScan<TYPE>::Data* PrefixScan<TYPE>::allocate(const Device* device, int maxSize, Option option)
{
ADLASSERT( TYPE == device->m_type );
//	The single top-level group can scan at most 2048 block sums.
ADLASSERT( maxSize <= BLOCK_SIZE*2*2048 );
const char* src[] = 
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
{prefixScanKernelsCL, prefixScanKernelsDX11};
#else
{0,0};
#endif
Data* data = new Data;
data->m_device = device;
data->m_localScanKernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
data->m_blockSumKernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
data->m_propagationKernel = device->getKernel( PATH, KERNEL2, 0, src[TYPE] );
//	Room for the per-block sums, rounded up, plus one slot for the total.
int bufSize = (NEXTMULTIPLEOF( max2( maxSize/BLOCK_SIZE, (int)BLOCK_SIZE ), BLOCK_SIZE )+1);
data->m_workBuffer = new Buffer<u32>( device, bufSize );
data->m_constBuffer[0] = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
data->m_constBuffer[1] = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
data->m_constBuffer[2] = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
data->m_maxSize = maxSize;
data->m_option = option;
return data;
}
//	Releases everything allocate() created, then the Data record itself.
template<DeviceType TYPE>
void PrefixScan<TYPE>::deallocate(Data* data)
{
	delete data->m_workBuffer;
	for(int i=0; i<3; i++)
		delete data->m_constBuffer[i];
	delete data;
}
//	Exclusive scan of n elements of src into dst (EXCLUSIVE only). If sum is
//	non-null it receives dst[n-1], the exclusive prefix of the last element.
template<DeviceType TYPE>
void PrefixScan<TYPE>::execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, u32* sum)
{
ADLASSERT( data );
ADLASSERT( n <= data->m_maxSize );
ADLASSERT( data->m_option == EXCLUSIVE );
const u32 numBlocks = u32( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
int4 constBuffer;
constBuffer.x = n;
constBuffer.y = numBlocks;
constBuffer.z = (int)nextPowerOf2( numBlocks );
Buffer<u32>* srcNative = BufferUtils::map<TYPE, true>( data->m_device, &src );
Buffer<u32>* dstNative = BufferUtils::map<TYPE, false>( data->m_device, &dst );
//	Phase 1: scan each block locally; per-block totals land in m_workBuffer.
{
BufferInfo bInfo[] = { BufferInfo( dstNative ), BufferInfo( srcNative ), BufferInfo( data->m_workBuffer ) };
Launcher launcher( data->m_device, data->m_localScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[0], constBuffer );
launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
}
//	Phase 2: scan the per-block totals with a single work-group.
{
BufferInfo bInfo[] = { BufferInfo( data->m_workBuffer ) };
Launcher launcher( data->m_device, data->m_blockSumKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[1], constBuffer );
launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
}
//	Phase 3: add each block's offset into its output range (first block needs none).
if( numBlocks > 1 )
{
BufferInfo bInfo[] = { BufferInfo( dstNative ), BufferInfo( data->m_workBuffer ) };
Launcher launcher( data->m_device, data->m_propagationKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[2], constBuffer );
launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
}
DeviceUtils::waitForCompletion( data->m_device );
if( sum )
{
//	Read back the last output element only.
dstNative->read( sum, 1, n-1);
}
DeviceUtils::waitForCompletion( data->m_device );
BufferUtils::unmap<false>( srcNative, &src );
BufferUtils::unmap<true>( dstNative, &dst );
}
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2

View File

@@ -0,0 +1,74 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//	Host (CPU) specialization: a simple sequential scan, no kernels or scratch
//	buffers required.
template<>
class PrefixScan<TYPE_HOST> : public PrefixScanBase
{
	public:
		struct Data
		{
			Option m_option;
		};

		//	maxSize is unused on the host; kept for interface parity.
		static
		Data* allocate(const Device* deviceData, int maxSize, Option option = EXCLUSIVE)
		{
			ADLASSERT( deviceData->m_type == TYPE_HOST );

			Data* data = new Data;
			data->m_option = option;
			return data;
		}

		static
		void deallocate(Data* data)
		{
			delete data;
		}

		//	Scans n elements of src into dst using the option chosen at
		//	allocation. If sum is non-null it receives dst[n-1], matching the
		//	device implementations.
		static
		void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, u32* sum = 0)
		{
			ADLASSERT( src.getType() == TYPE_HOST && dst.getType() == TYPE_HOST );

			HostBuffer<u32>& hSrc = (HostBuffer<u32>&)src;
			HostBuffer<u32>& hDst = (HostBuffer<u32>&)dst;

			u32 s = 0;
			if( data->m_option == EXCLUSIVE )
			{
				for(int i=0; i<n; i++)
				{
					hDst[i] = s;
					s += hSrc[i];
				}
			}
			else
			{
				for(int i=0; i<n; i++)
				{
					s += hSrc[i];
					hDst[i] = s;
				}
			}

			//	Guard n > 0: reading hDst[n-1] with n == 0 was out of bounds.
			if( sum && n > 0 )
			{
				*sum = hDst[n-1];
			}
		}
};

View File

@@ -0,0 +1,153 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
// takahiro end
#define WG_SIZE 128
//	Kernel arguments shared by all three scan kernels (16-byte padded).
typedef struct
{
uint m_numElems;	// number of valid input elements
uint m_numBlocks;	// number of local-scan blocks
uint m_numScanBlocks;	// m_numBlocks rounded up to a power of two
uint m_padding[1];
} ConstBuffer;
//	Work-group cooperative exclusive (Blelloch) scan of n elements in local
//	memory, in place. All work-items of the group must call this. The return
//	value is the total of all n inputs, but it is assigned only on work-item 0;
//	other lanes return an uninitialized value (callers only use lane 0's).
u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)
{
u32 blocksum;
int offset = 1;
//	Up-sweep (reduce): build a partial-sum tree in place.
for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
{
GROUP_LDS_BARRIER;
for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
data[bi] += data[ai];
}
}
GROUP_LDS_BARRIER;
//	Capture the total and clear the root before the down-sweep.
if( lIdx == 0 )
{
blocksum = data[ n-1 ];
data[ n-1 ] = 0;
}
GROUP_LDS_BARRIER;
offset >>= 1;
//	Down-sweep: push prefixes back down the tree.
for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
{
GROUP_LDS_BARRIER;
for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
u32 temp = data[ai];
data[ai] = data[bi];
data[bi] += temp;
}
}
GROUP_LDS_BARRIER;
return blocksum;
}
//	Phase 1: each work-group loads WG_SIZE*2 elements (zero-padded past
//	m_numElems), scans them exclusively in local memory, writes the scanned
//	values to dst and the group total to sumBuffer[group].
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,
ConstBuffer cb)
{
__local u32 ldsData[WG_SIZE*2];
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
//	Two elements per work-item; out-of-range slots contribute 0.
ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;
ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;
u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);
//	Only lane 0 holds the valid group total.
if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;
if( (2*gIdx) < cb.m_numElems )
{
dst[2*gIdx] = ldsData[2*lIdx];
}
if( (2*gIdx + 1) < cb.m_numElems )
{
dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
}
}
//	Phase 3: adds the scanned block offset to every element of each block
//	except the first (group g handles block g+1, which is why the dispatch
//	uses numBlocks-1 groups).
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, ConstBuffer cb)
{
const u32 blockSize = WG_SIZE*2;
int myIdx = GET_GROUP_IDX+1;
int lIdx = GET_LOCAL_IDX;
u32 iBlockSum = blockSum[myIdx];
//	Clamp the last block to the valid element count.
int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);
for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)
{
dst[i] += iBlockSum;
}
}
//	Phase 2: a single work-group scans the per-block sums in place
//	(m_numScanBlocks is the power-of-two padded count, at most 2048) and
//	appends the grand total at dst[m_numBlocks].
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void TopLevelScanKernel(__global u32* dst, ConstBuffer cb)
{
__local u32 ldsData[2048];
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
int lSize = GET_GROUP_SIZE;
//	Load block sums, zero-padding up to the power-of-two scan width.
for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )
{
ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;
}
GROUP_LDS_BARRIER;
u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);
for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )
{
dst[i] = ldsData[i];
}
if( gIdx == 0 )
{
dst[cb.m_numBlocks] = sum;
}
}

View File

@@ -0,0 +1,157 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
typedef uint u32;
#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
// takahiro end
#define WG_SIZE 128
#define GET_GROUP_SIZE WG_SIZE
cbuffer SortCB : register( b0 )
{
int m_numElems;
int m_numBlocks;
int m_numScanBlocks;
};
RWStructuredBuffer<uint> dst : register( u0 );
RWStructuredBuffer<uint> src : register( u1 );
RWStructuredBuffer<uint> sumBuffer : register( u2 );
groupshared u32 ldsData[2048];
//	Group-cooperative exclusive (Blelloch) scan of n elements of the shared
//	ldsData array, in place. All threads of the group must call this. The
//	return value is the total of all n inputs, assigned only on thread 0;
//	other threads return an uninitialized value (callers only use thread 0's).
u32 ScanExclusive(u32 n, int lIdx, int lSize)
{
u32 blocksum;
int offset = 1;
//	Up-sweep (reduce): build a partial-sum tree in place.
for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
{
GROUP_LDS_BARRIER;
for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
ldsData[bi] += ldsData[ai];
}
}
GROUP_LDS_BARRIER;
//	Capture the total and clear the root before the down-sweep.
if( lIdx == 0 )
{
blocksum = ldsData[ n-1 ];
ldsData[ n-1 ] = 0;
}
GROUP_LDS_BARRIER;
offset >>= 1;
//	Down-sweep: push prefixes back down the tree.
for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
{
GROUP_LDS_BARRIER;
for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
{
int ai = offset*(2*iIdx+1)-1;
int bi = offset*(2*iIdx+2)-1;
u32 temp = ldsData[ai];
ldsData[ai] = ldsData[bi];
ldsData[bi] += temp;
}
}
GROUP_LDS_BARRIER;
return blocksum;
}
//	Phase 1: each group loads WG_SIZE*2 elements (zero-padded past
//	m_numElems), scans them exclusively in groupshared memory, writes the
//	scanned values to dst and the group total to sumBuffer[group].
[numthreads(WG_SIZE, 1, 1)]
void LocalScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)
{
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
//	Two elements per thread; out-of-range slots contribute 0.
ldsData[2*lIdx] = ( 2*gIdx < m_numElems )? src[2*gIdx]: 0;
ldsData[2*lIdx + 1] = ( 2*gIdx+1 < m_numElems )? src[2*gIdx + 1]: 0;
u32 sum = ScanExclusive(WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);
//	Only thread 0 holds the valid group total.
if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;
if( (2*gIdx) < m_numElems )
{
dst[2*gIdx] = ldsData[2*lIdx];
}
if( (2*gIdx + 1) < m_numElems )
{
dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
}
}
//	Phase 2: a single group scans the per-block sums in place (m_numScanBlocks
//	is the power-of-two padded count, at most 2048) and appends the grand
//	total at dst[m_numBlocks].
[numthreads(WG_SIZE, 1, 1)]
void TopLevelScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)
{
int gIdx = GET_GLOBAL_IDX;
int lIdx = GET_LOCAL_IDX;
int lSize = GET_GROUP_SIZE;
//	Load block sums, zero-padding up to the power-of-two scan width.
for(int i=lIdx; i<m_numScanBlocks; i+=lSize )
{
ldsData[i] = (i<m_numBlocks)? dst[i]:0;
}
GROUP_LDS_BARRIER;
u32 sum = ScanExclusive(m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);
for(int i=lIdx; i<m_numBlocks; i+=lSize )
{
dst[i] = ldsData[i];
}
if( gIdx == 0 )
{
dst[m_numBlocks] = sum;
}
}
RWStructuredBuffer<uint> blockSum2 : register( u1 );
//	Phase 3: adds the scanned block offset (read from blockSum2, bound to the
//	same u1 register as src) to every element of each block except the first;
//	group g handles block g+1.
[numthreads(WG_SIZE, 1, 1)]
void AddOffsetKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)
{
const u32 blockSize = WG_SIZE*2;
int myIdx = GET_GROUP_IDX+1;
int llIdx = GET_LOCAL_IDX;
u32 iBlockSum = blockSum2[myIdx];
//	Clamp the last block to the valid element count.
int endValue = min((myIdx+1)*(blockSize), m_numElems);
for(int i=myIdx*blockSize+llIdx; i<endValue; i+=GET_GROUP_SIZE)
{
dst[i] += iBlockSum;
}
}

View File

@@ -0,0 +1,143 @@
static const char* prefixScanKernelsCL= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128\n"
"\n"
"typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" u32 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" u32 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" return blocksum;\n"
"}\n"
"\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
" ConstBuffer cb)\n"
"{\n"
" __local u32 ldsData[WG_SIZE*2];\n"
"\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
"\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
"\n"
" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
"\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, ConstBuffer cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
"\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
"\n"
" u32 iBlockSum = blockSum[myIdx];\n"
"\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global u32* dst, ConstBuffer cb)\n"
"{\n"
" __local u32 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
"\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
"\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n"
;

View File

@@ -0,0 +1,147 @@
static const char* prefixScanKernelsDX11= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128\n"
"\n"
"#define GET_GROUP_SIZE WG_SIZE\n"
"\n"
"\n"
"cbuffer SortCB : register( b0 )\n"
"{\n"
" int m_numElems;\n"
" int m_numBlocks;\n"
" int m_numScanBlocks;\n"
"};\n"
" \n"
"RWStructuredBuffer<uint> dst : register( u0 );\n"
"RWStructuredBuffer<uint> src : register( u1 );\n"
"RWStructuredBuffer<uint> sumBuffer : register( u2 );\n"
"\n"
"\n"
"groupshared u32 ldsData[2048];\n"
"\n"
"u32 ScanExclusive(u32 n, int lIdx, int lSize)\n"
"{\n"
" u32 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" ldsData[bi] += ldsData[ai];\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = ldsData[ n-1 ];\n"
" ldsData[ n-1 ] = 0;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" u32 temp = ldsData[ai];\n"
" ldsData[ai] = ldsData[bi];\n"
" ldsData[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" return blocksum;\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void LocalScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
"\n"
" ldsData[2*lIdx] = ( 2*gIdx < m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < m_numElems )? src[2*gIdx + 1]: 0;\n"
"\n"
" u32 sum = ScanExclusive(WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
"\n"
" if( (2*gIdx) < m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void TopLevelScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
"\n"
" for(int i=lIdx; i<m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<m_numBlocks)? dst[i]:0;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" u32 sum = ScanExclusive(m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
" for(int i=lIdx; i<m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
"\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[m_numBlocks] = sum;\n"
" }\n"
"}\n"
"\n"
"\n"
" \n"
"RWStructuredBuffer<uint> blockSum2 : register( u1 );\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void AddOffsetKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
"\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int llIdx = GET_LOCAL_IDX;\n"
"\n"
" u32 iBlockSum = blockSum2[myIdx];\n"
"\n"
" int endValue = min((myIdx+1)*(blockSize), m_numElems);\n"
" for(int i=myIdx*blockSize+llIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,73 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma once
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
#include <AdlPrimitives/Sort/SortData.h>
#include <AdlPrimitives/Fill/Fill.h>
namespace adl
{
//	Search modes shared by every BoundSearch device specialization.
class BoundSearchBase
{
public:
enum Option
{
BOUND_LOWER,	// dst[key] = index of the first element with that key
BOUND_UPPER,	// dst[key] = one past the last element with that key
COUNT,	// dst[key] = number of elements with that key (upper - lower)
};
};
//	GPU bound search over key-sorted SortData: for each key value, finds the
//	lower bound, upper bound, or element count in the sorted input.
template<DeviceType TYPE>
class BoundSearch : public BoundSearchBase
{
public:
typedef Launcher::BufferInfo BufferInfo;
//	Per-instance state; the COUNT-only members are null unless allocate()
//	was given a non-zero maxSize.
struct Data
{
const Device* m_device;
Kernel* m_lowerSortDataKernel;
Kernel* m_upperSortDataKernel;
Kernel* m_subtractKernel;	// dst = upper - lower (COUNT mode only)
Buffer<int4>* m_constBuffer;
Buffer<u32>* m_lower;	// scratch for COUNT mode
Buffer<u32>* m_upper;	// scratch for COUNT mode
typename Fill<TYPE>::Data* m_fillData;	// used to zero the scratch buffers
};
static
Data* allocate(const Device* deviceData, int maxSize = 0);
static
void deallocate(Data* data);
// src has to be src[i].m_key <= src[i+1].m_key
static
void execute(Data* data, Buffer<SortData>& src, u32 nSrc, Buffer<u32>& dst, u32 nDst, Option option = BOUND_LOWER );
// static
// void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, Option option = );
};
#include <AdlPrimitives/Search/BoundSearchHost.inl>
#include <AdlPrimitives/Search/BoundSearch.inl>
};

View File

@@ -0,0 +1,128 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Search\\BoundSearchKernels"
#define KERNEL0 "SearchSortDataLowerKernel"
#define KERNEL1 "SearchSortDataUpperKernel"
#define KERNEL2 "SubtractKernel"
#include <AdlPrimitives/Search/BoundSearchKernelsCL.h>
#include <AdlPrimitives/Search/BoundSearchKernelsDX11.h>
//	Creates the search kernels. maxSize == 0 allocates only what BOUND_LOWER/
//	BOUND_UPPER need; a non-zero maxSize additionally creates the subtract
//	kernel and scratch buffers required by the COUNT option.
template<DeviceType TYPE>
typename BoundSearch<TYPE>::Data* BoundSearch<TYPE>::allocate(const Device* device, int maxSize)
{
	ADLASSERT( TYPE == device->m_type );
	const char* src[] = 
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
		{boundSearchKernelsCL, boundSearchKernelsDX11};
#else
		{0,0};
#endif
	Data* data = new Data;
	data->m_device = device;
	data->m_lowerSortDataKernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
	data->m_upperSortDataKernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
	data->m_constBuffer = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
	data->m_subtractKernel = 0;	// was left uninitialized when maxSize == 0
	if( maxSize )
	{
		data->m_subtractKernel = device->getKernel( PATH, KERNEL2, 0, src[TYPE] );
	}
	data->m_lower = (maxSize == 0)? 0: new Buffer<u32>( device, maxSize );
	data->m_upper = (maxSize == 0)? 0: new Buffer<u32>( device, maxSize );
	data->m_fillData = (maxSize == 0)? 0: Fill<TYPE>::allocate( device );
	return data;
}
/// Releases everything owned by a BoundSearch Data created via allocate().
/// Kernels are cached by the Device and are not released here.
template<DeviceType TYPE>
void BoundSearch<TYPE>::deallocate(Data* data)
{
	delete data->m_constBuffer;
	delete data->m_lower;	// deleting a null pointer is a no-op
	delete data->m_upper;
	if( data->m_fillData )
	{
		Fill<TYPE>::deallocate( data->m_fillData );
	}
	delete data;
}
/// Runs a bound search over a key-sorted array.
/// @param src     SortData sorted by m_key (keys in [0, nDst)).
/// @param nSrc    number of valid entries in src.
/// @param dst     receives nDst entries: lower bounds, upper bounds, or
///                per-key element counts depending on option.
/// @param option  BOUND_LOWER / BOUND_UPPER / COUNT (COUNT requires that
///                allocate() was called with maxSize >= nDst).
template<DeviceType TYPE>
void BoundSearch<TYPE>::execute(Data* data, Buffer<SortData>& src, u32 nSrc, Buffer<u32>& dst, u32 nDst, Option option )
{
	// Kernel constant buffer layout: x = nSrc, y = nDst.
	int4 constBuffer;
	constBuffer.x = nSrc;
	constBuffer.y = nDst;

	// Map to device-native buffers; src is read (copied in), dst is written back.
	Buffer<SortData>* srcNative = BufferUtils::map<TYPE, true>( data->m_device, &src );
	Buffer<u32>* dstNative = BufferUtils::map<TYPE, false>( data->m_device, &dst );

	if( option == BOUND_LOWER )
	{
		BufferInfo bInfo[] = { BufferInfo( srcNative, true ), BufferInfo( dstNative ) };

		Launcher launcher( data->m_device, data->m_lowerSortDataKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
		launcher.setConst( *data->m_constBuffer, constBuffer );
		launcher.launch1D( nSrc, 64 );
	}
	else if( option == BOUND_UPPER )
	{
		BufferInfo bInfo[] = { BufferInfo( srcNative, true ), BufferInfo( dstNative ) };

		Launcher launcher( data->m_device, data->m_upperSortDataKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
		launcher.setConst( *data->m_constBuffer, constBuffer );
		// One extra thread so the nSrc boundary writes the closing bound.
		launcher.launch1D( nSrc+1, 64 );
	}
	else if( option == COUNT )
	{
		ADLASSERT( data->m_lower );
		ADLASSERT( data->m_upper );
		// Fix: the scratch buffers must hold at least nDst entries; the
		// original asserts were inverted (getSize() <= nDst), which rejected
		// valid sizes and accepted overflowing ones.
		ADLASSERT( data->m_lower->getSize() >= (int)nDst );
		ADLASSERT( data->m_upper->getSize() >= (int)nDst );

		int zero = 0;
		Fill<TYPE>::execute( data->m_fillData, (Buffer<int>&)*data->m_lower, zero, nDst );
		Fill<TYPE>::execute( data->m_fillData, (Buffer<int>&)*data->m_upper, zero, nDst );

		// count[i] = upper[i] - lower[i]
		execute( data, src, nSrc, *data->m_lower, nDst, BOUND_LOWER );
		execute( data, src, nSrc, *data->m_upper, nDst, BOUND_UPPER );

		{
			BufferInfo bInfo[] = { BufferInfo( data->m_upper, true ), BufferInfo( data->m_lower, true ), BufferInfo( dstNative ) };

			Launcher launcher( data->m_device, data->m_subtractKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer, constBuffer );
			launcher.launch1D( nDst, 64 );
		}
	}
	else
	{
		ADLASSERT( 0 );
	}

	BufferUtils::unmap<false>( srcNative, &src );
	BufferUtils::unmap<true>( dstNative, &dst );
}
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2

View File

@@ -0,0 +1,111 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
template<>
class BoundSearch<TYPE_HOST> : public BoundSearchBase
{
public:
typedef Launcher::BufferInfo BufferInfo;
struct Data
{
const Device* m_device;
};
static
Data* allocate(const Device* deviceData, int maxSize = 0)
{
ADLASSERT( deviceData->m_type == TYPE_HOST );
Data* data = new Data;
data->m_device = deviceData;
return data;
}
static
void deallocate(Data* data)
{
delete data;
}
static
void execute(Data* data, Buffer<SortData>& rawSrc, u32 nSrc, Buffer<u32>& rawDst, u32 nDst, Option option = BOUND_LOWER)
{
ADLASSERT( rawSrc.getType() == TYPE_HOST );
ADLASSERT( rawDst.getType() == TYPE_HOST );
HostBuffer<SortData>& src = *(HostBuffer<SortData>*)&rawSrc;
HostBuffer<u32>& dst = *(HostBuffer<u32>*)&rawDst;
for(int i=0; i<nSrc-1; i++)
ADLASSERT( src[i].m_key <= src[i+1].m_key );
if( option == BOUND_LOWER )
{
for(u32 i=0; i<nSrc; i++)
{
SortData& iData = (i==0)? SortData(-1,-1): src[i-1];
SortData& jData = (i==nSrc)? SortData(nDst, nDst): src[i];
if( iData.m_key != jData.m_key )
{
// for(u32 k=iData.m_key+1; k<=min(jData.m_key,nDst-1); k++)
u32 k = jData.m_key;
{
dst[k] = i;
}
}
}
}
else if( option == BOUND_UPPER )
{
for(u32 i=0; i<nSrc+1; i++)
{
SortData& iData = (i==0)? SortData(0,0): src[i-1];
SortData& jData = (i==nSrc)? SortData(nDst, nDst): src[i];
if( iData.m_key != jData.m_key )
{
// for(u32 k=iData.m_key; k<min(jData.m_key,nDst); k++)
u32 k = iData.m_key;
{
dst[k] = i;
}
}
}
}
else if( option == COUNT )
{
HostBuffer<u32> lower( data->m_device, nDst );
HostBuffer<u32> upper( data->m_device, nDst );
for(u32 i=0; i<nDst; i++) { lower[i] = upper[i] = 0; }
execute( data, rawSrc, nSrc, lower, nDst, BOUND_LOWER );
execute( data, rawSrc, nSrc, upper, nDst, BOUND_UPPER );
for(u32 i=0; i<nDst; i++) { dst[i] = upper[i] - lower[i]; }
}
else
{
ADLASSERT( 0 );
}
}
// static
// void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, Option option = );
};

View File

@@ -0,0 +1,112 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
// Key/value pair produced by the radix sort; m_key is a bucket index.
typedef struct
{
u32 m_key;
u32 m_value;
}SortData;
// Kernel constants: element count of src and of dst (padded to 16 bytes).
typedef struct
{
u32 m_nSrc;
u32 m_nDst;
u32 m_padding[2];
} ConstBuffer;
// One work item per input element: where the key changes between neighbors,
// writes the index of the first element of the new run to dst[key]
// (a lower-bound table over the key-sorted src).
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst,
ConstBuffer cb)
{
int gIdx = GET_GLOBAL_IDX;
u32 nSrc = cb.m_nSrc;
u32 nDst = cb.m_nDst;
if( gIdx < nSrc )
{
// Sentinels for the array boundaries.
SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);
SortData end; end.m_key = nDst; end.m_value = nDst;
SortData iData = (gIdx==0)? first: src[gIdx-1];
SortData jData = (gIdx==nSrc)? end: src[gIdx];
// Key change => start of a new run; record its first index.
if( iData.m_key != jData.m_key )
{
// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)
u32 k = jData.m_key;
{
dst[k] = gIdx;
}
}
}
}
// Upper-bound counterpart: where the key changes, writes the one-past-the-end
// index of the previous run to dst[previous key]. Launched with nSrc+1
// threads so the final run is closed by the gIdx == nSrc sentinel case.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst,
ConstBuffer cb)
{
int gIdx = GET_GLOBAL_IDX;
u32 nSrc = cb.m_nSrc;
u32 nDst = cb.m_nDst;
if( gIdx < nSrc+1 )
{
// Sentinels for the array boundaries.
SortData first; first.m_key = 0; first.m_value = 0;
SortData end; end.m_key = nDst; end.m_value = nDst;
SortData iData = (gIdx==0)? first: src[gIdx-1];
SortData jData = (gIdx==nSrc)? end: src[gIdx];
if( iData.m_key != jData.m_key )
{
// for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)
u32 k = iData.m_key;
{
dst[k] = gIdx;
}
}
}
}
// Elementwise C[i] = A[i] - B[i] over nDst elements; used to turn
// (upper bound - lower bound) tables into per-key counts.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C,
ConstBuffer cb)
{
int gIdx = GET_GLOBAL_IDX;
u32 nSrc = cb.m_nSrc;
u32 nDst = cb.m_nDst;
if( gIdx < nDst )
{
C[gIdx] = A[gIdx] - B[gIdx];
}
}

View File

@@ -0,0 +1,104 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
typedef uint u32;
#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
#define AtomInc(x) InterlockedAdd(x, 1)
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
// Key/value pair produced by the radix sort; m_key is a bucket index.
typedef struct
{
u32 m_key;
u32 m_value;
}SortData;
// Kernel constants: element counts of the src and dst buffers.
cbuffer SortCB : register( b0 )
{
u32 m_nSrc;
u32 m_nDst;
u32 m_padding[2];
};
// src: key-sorted input (read-only); dst: bound table output (read-write).
StructuredBuffer<SortData> src : register( t0 );
RWStructuredBuffer<u32> dst : register( u0 );
// HLSL port of the OpenCL lower-bound kernel: one thread per element writes
// the index of the first element of each new key run to dst[key].
[numthreads(64, 1, 1)]
void SearchSortDataLowerKernel( DEFAULT_ARGS )
{
int gIdx = GET_GLOBAL_IDX;
u32 nSrc = m_nSrc;
u32 nDst = m_nDst;
if( gIdx < nSrc )
{
// Sentinel neighbors for the array boundaries.
SortData iData;
SortData jData;
if( gIdx==0 ) iData.m_key = iData.m_value = (u32)-1;
else iData = src[gIdx-1];
if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;
else jData = src[gIdx];
// Key change => start of a new run; record its first index.
if( iData.m_key != jData.m_key )
{
// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)
u32 k = jData.m_key;
{
dst[k] = gIdx;
}
}
}
}
// HLSL port of the OpenCL upper-bound kernel: dispatched with nSrc+1 threads
// so the gIdx == nSrc sentinel case closes the final key run.
[numthreads(64, 1, 1)]
void SearchSortDataUpperKernel( DEFAULT_ARGS )
{
int gIdx = GET_GLOBAL_IDX;
u32 nSrc = m_nSrc;
u32 nDst = m_nDst;
if( gIdx < nSrc+1 )
{
SortData iData;
SortData jData;
if( gIdx==0 ) iData.m_key = iData.m_value = 0;
else iData = src[gIdx-1];
if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;
else jData = src[gIdx];
if( iData.m_key != jData.m_key )
{
// for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)
u32 k = iData.m_key;
{
dst[k] = gIdx;
}
}
}
}

View File

@@ -0,0 +1,102 @@
// Auto-generated embedding of the BoundSearch OpenCL kernel source, used when
// ADL_LOAD_KERNEL_FROM_STRING is defined. Keep in sync with the standalone
// .cl file; do not edit the string by hand.
static const char* boundSearchKernelsCL= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_key; \n"
" u32 m_value;\n"
"}SortData;\n"
"\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_nSrc;\n"
" u32 m_nDst;\n"
" u32 m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" u32 nSrc = cb.m_nSrc;\n"
" u32 nDst = cb.m_nDst;\n"
"\n"
" if( gIdx < nSrc )\n"
" {\n"
" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
"\n"
" SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
"\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
" u32 k = jData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"\n"
"\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" u32 nSrc = cb.m_nSrc;\n"
" u32 nDst = cb.m_nDst;\n"
"\n"
" if( gIdx < nSrc+1 )\n"
" {\n"
" SortData first; first.m_key = 0; first.m_value = 0;\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
"\n"
" SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
"\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
"// for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)\n"
" u32 k = iData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" u32 nSrc = cb.m_nSrc;\n"
" u32 nDst = cb.m_nDst;\n"
"\n"
" if( gIdx < nDst )\n"
" {\n"
" C[gIdx] = A[gIdx] - B[gIdx];\n"
" }\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,94 @@
// Auto-generated embedding of the BoundSearch DX11/HLSL kernel source, used
// when ADL_LOAD_KERNEL_FROM_STRING is defined. Keep in sync with the
// standalone .hlsl file; do not edit the string by hand.
static const char* boundSearchKernelsDX11= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_key; \n"
" u32 m_value;\n"
"}SortData;\n"
"\n"
"\n"
"\n"
"cbuffer SortCB : register( b0 )\n"
"{\n"
" u32 m_nSrc;\n"
" u32 m_nDst;\n"
" u32 m_padding[2];\n"
"};\n"
"\n"
"\n"
"StructuredBuffer<SortData> src : register( t0 );\n"
"RWStructuredBuffer<u32> dst : register( u0 );\n"
"\n"
"\n"
"[numthreads(64, 1, 1)]\n"
"void SearchSortDataLowerKernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" u32 nSrc = m_nSrc;\n"
" u32 nDst = m_nDst;\n"
"\n"
" if( gIdx < nSrc )\n"
" {\n"
" SortData iData;\n"
" SortData jData;\n"
" if( gIdx==0 ) iData.m_key = iData.m_value = (u32)-1;\n"
" else iData = src[gIdx-1];\n"
"\n"
" if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;\n"
" else jData = src[gIdx];\n"
"\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
" u32 k = jData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"\n"
"[numthreads(64, 1, 1)]\n"
"void SearchSortDataUpperKernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" u32 nSrc = m_nSrc;\n"
" u32 nDst = m_nDst;\n"
"\n"
" if( gIdx < nSrc+1 )\n"
" {\n"
" SortData iData;\n"
" SortData jData;\n"
" if( gIdx==0 ) iData.m_key = iData.m_value = 0;\n"
" else iData = src[gIdx-1];\n"
"\n"
" if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;\n"
" else jData = src[gIdx];\n"
"\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
"// for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)\n"
" u32 k = iData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,53 @@
/*
2011 Takahiro Harada
*/
#pragma once
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
#include <AdlPrimitives/Sort/SortData.h>
#include <AdlPrimitives/Scan/PrefixScan.h>
namespace adl
{
// Non-template base holding the implementation-selection enum shared by all
// RadixSort<TYPE> specializations; the option chooses which backend
// (Simple/Standard/Advanced) allocate() instantiates.
class RadixSortBase
{
public:
enum Option
{
SORT_SIMPLE,
SORT_STANDARD,
SORT_ADVANCED
};
};
// Facade over the three radix-sort backends. Data records which backend was
// chosen at allocate() time; deallocate/execute dispatch on m_option.
template<DeviceType TYPE>
class RadixSort : public RadixSortBase
{
public:
struct Data
{
// Backend selected at allocation; used by DISPATCH_IMPL in the .inl.
Option m_option;
const Device* m_deviceData;
typename PrefixScan<TYPE>::Data* m_scanData;
int m_maxSize;
};

// Creates backend-specific sort state for up to maxSize elements.
static
Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_STANDARD);

static
void deallocate(Data* data);

// In-place sort of the first n entries of inout on the low sortBits bits.
static
void execute(Data* data, Buffer<SortData>& inout, int n, int sortBits = 32);
};
#include <AdlPrimitives/Sort/RadixSort.inl>
#include <AdlPrimitives/Sort/RadixSortHost.inl>
};

View File

@@ -0,0 +1,58 @@
/*
2011 Takahiro Harada
*/
#include <AdlPrimitives/Sort/RadixSortSimple.inl>
#include <AdlPrimitives/Sort/RadixSortStandard.inl>
#include <AdlPrimitives/Sort/RadixSortAdvanced.inl>
// Forwards a call x to the backend implementation recorded in
// data->m_option (Simple/Standard/Advanced).
#define DISPATCH_IMPL(x) \
switch( data->m_option ) \
{ \
case SORT_SIMPLE: RadixSortSimple<TYPE>::x; break; \
case SORT_STANDARD: RadixSortStandard<TYPE>::x; break; \
case SORT_ADVANCED: RadixSortAdvanced<TYPE>::x; break; \
default:ADLASSERT(0);break; \
}
/// Creates backend-specific sort state for the requested variant.
/// @param deviceData  target device; its type must match TYPE.
/// @param maxSize     maximum number of elements that will be sorted.
/// @param option      backend selection (SORT_SIMPLE/STANDARD/ADVANCED).
/// @return pointer to the backend's Data, released via deallocate().
template<DeviceType TYPE>
typename RadixSort<TYPE>::Data* RadixSort<TYPE>::allocate(const Device* deviceData, int maxSize, Option option)
{
	ADLASSERT( TYPE == deviceData->m_type );

	// Fix: initialize so an unhandled option cannot return an uninitialized
	// pointer when ADLASSERT compiles to a no-op in release builds.
	void* dataOut = 0;
	switch( option )
	{
	case SORT_SIMPLE:
		dataOut = RadixSortSimple<TYPE>::allocate( deviceData, maxSize, option );
		break;
	case SORT_STANDARD:
		dataOut = RadixSortStandard<TYPE>::allocate( deviceData, maxSize, option );
		break;
	case SORT_ADVANCED:
		dataOut = RadixSortAdvanced<TYPE>::allocate( deviceData, maxSize, option );
		break;
	default:
		ADLASSERT(0);
		break;
	}
	return (typename RadixSort<TYPE>::Data*)dataOut;
}
// Releases backend state; dispatches to the implementation chosen at allocate().
template<DeviceType TYPE>
void RadixSort<TYPE>::deallocate(Data* data)
{
DISPATCH_IMPL( deallocate( data ) );
}
// In-place key/value sort of the first n elements on the low sortBits bits;
// dispatches to the backend chosen at allocate().
template<DeviceType TYPE>
void RadixSort<TYPE>::execute(Data* data, Buffer<SortData>& inout, int n, int sortBits)
{
DISPATCH_IMPL( execute( data, inout, n, sortBits ) );
}
#undef DISPATCH_IMPL

View File

@@ -0,0 +1,98 @@
/*
2011 Takahiro Harada
*/
#pragma once
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
#include <AdlPrimitives/Copy/Copy.h>
#include <AdlPrimitives/Sort/SortData.h>
namespace adl
{
// Placeholder non-template base for RadixSort32 specializations; unlike
// RadixSortBase there is currently only one implementation, so the option
// enum is kept commented out.
class RadixSort32Base
{
public:
// enum Option
// {
// SORT_SIMPLE,
// SORT_STANDARD,
// SORT_ADVANCED
// };
};
// 32-bit LSD radix sort processing BITS_PER_PASS bits per GPU pass,
// with key-only, key/value and SortData entry points.
template<DeviceType TYPE>
class RadixSort32 : public RadixSort32Base
{
public:
typedef Launcher::BufferInfo BufferInfo;

enum
{
// Input size must be a multiple of DATA_ALIGNMENT (asserted in execute).
DATA_ALIGNMENT = 256,
WG_SIZE = 64,
ELEMENTS_PER_WORK_ITEM = (256/WG_SIZE),
BITS_PER_PASS = 4,
// if you change this, change nPerWI in kernel as well
NUM_WGS = 20*6, // cypress
// NUM_WGS = 24*6, // cayman
// NUM_WGS = 32*4, // nv
};

// Per-pass kernel constants (mirrored in the kernel source).
struct ConstData
{
int m_n;
int m_nWGs;
int m_startBit;
int m_nBlocksPerWG;
};

struct Data
{
const Device* m_device;
int m_maxSize;
Kernel* m_streamCountKernel;
Kernel* m_streamCountSortDataKernel;
Kernel* m_prefixScanKernel;
Kernel* m_sortAndScatterKernel;
Kernel* m_sortAndScatterKeyValueKernel;
Kernel* m_sortAndScatterSortDataKernel;
// Ping-pong scratch: keys, histogram, values, SortData.
Buffer<u32>* m_workBuffer0;
Buffer<u32>* m_workBuffer1;
Buffer<u32>* m_workBuffer2;
Buffer<SortData>* m_workBuffer3;
// One constant buffer per radix pass so passes do not overwrite each other.
Buffer<ConstData>* m_constBuffer[32/BITS_PER_PASS];
typename Copy<TYPE>::Data* m_copyData;
};

static
Data* allocate(const Device* device, int maxSize);

static
void deallocate(Data* data);

// In-place key sort.
static
void execute(Data* data, Buffer<u32>& inout, int n, int sortBits = 32);
// Out-of-place key sort.
static
void execute(Data* data, Buffer<u32>& in, Buffer<u32>& out, int n, int sortBits = 32);
// Out-of-place key/value sort.
static
void execute(Data* data, Buffer<u32>& keysIn, Buffer<u32>& keysOut, Buffer<u32>& valuesIn, Buffer<u32>& valuesOut, int n, int sortBits = 32);
// In-place SortData sort.
static
void execute(Data* data, Buffer<SortData>& keyValuesInOut, int n, int sortBits = 32 );
};
#include <AdlPrimitives/Sort/RadixSort32Host.inl>
#include <AdlPrimitives/Sort/RadixSort32.inl>
};

View File

@@ -0,0 +1,346 @@
/*
2011 Takahiro Harada
*/
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Sort\\RadixSort32Kernels"
#define RADIXSORT32_KERNEL0 "StreamCountKernel"
#define RADIXSORT32_KERNEL1 "PrefixScanKernel"
#define RADIXSORT32_KERNEL2 "SortAndScatterKernel"
#define RADIXSORT32_KERNEL3 "SortAndScatterKeyValueKernel"
#define RADIXSORT32_KERNEL4 "SortAndScatterSortDataKernel"
#define RADIXSORT32_KERNEL5 "StreamCountSortDataKernel"
#include "RadixSort32KernelsCL.h"
#include "RadixSort32KernelsDX11.h"
// todo. Shader compiler (2010JuneSDK) doesn't allow me to place Barriers in SortAndScatterKernel...
// So it only works on a GPU with 64 wide SIMD.
/// Builds the RadixSort32 working set for up to maxSize elements: the six
/// kernels, ping-pong scratch buffers, and one constant buffer per pass.
template<DeviceType TYPE>
typename RadixSort32<TYPE>::Data* RadixSort32<TYPE>::allocate( const Device* device, int maxSize )
{
	ADLASSERT( TYPE == device->m_type );

	// Kernel source: embedded strings when ADL_LOAD_KERNEL_FROM_STRING is
	// defined, otherwise loaded from PATH at runtime; indexed by device type.
	const char* src[] =
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
		{radixSort32KernelsCL, radixSort32KernelsDX11};
#else
		{0,0};
#endif

	Data* data = new Data;
	data->m_device = device;
	data->m_maxSize = maxSize;

	data->m_streamCountKernel = device->getKernel( PATH, RADIXSORT32_KERNEL0, 0, src[TYPE] );
	data->m_streamCountSortDataKernel = device->getKernel( PATH, RADIXSORT32_KERNEL5, 0, src[TYPE] );
	data->m_prefixScanKernel = device->getKernel( PATH, RADIXSORT32_KERNEL1, 0, src[TYPE] );
	data->m_sortAndScatterKernel = device->getKernel( PATH, RADIXSORT32_KERNEL2, 0, src[TYPE] );
	data->m_sortAndScatterKeyValueKernel = device->getKernel( PATH, RADIXSORT32_KERNEL3, 0, src[TYPE] );
	data->m_sortAndScatterSortDataKernel = device->getKernel( PATH, RADIXSORT32_KERNEL4, 0, src[TYPE] );

	// One histogram slot per radix bucket per work group.
	const int histogramSize = NUM_WGS*(1<<BITS_PER_PASS);
	data->m_workBuffer0 = new Buffer<u32>( device, maxSize );
	data->m_workBuffer1 = new Buffer<u32>( device, histogramSize );
	data->m_workBuffer2 = new Buffer<u32>( device, maxSize );
	data->m_workBuffer3 = new Buffer<SortData>( device, maxSize );

	for(int pass=0; pass<32/BITS_PER_PASS; pass++)
	{
		data->m_constBuffer[pass] = new Buffer<ConstData>( device, 1, BufferBase::BUFFER_CONST );
	}

	data->m_copyData = Copy<TYPE>::allocate( device );
	return data;
}
/// Releases every buffer owned by a RadixSort32 Data. Kernels are cached by
/// the Device and are not released here.
template<DeviceType TYPE>
void RadixSort32<TYPE>::deallocate( Data* data )
{
	delete data->m_workBuffer0;
	delete data->m_workBuffer1;
	delete data->m_workBuffer2;
	delete data->m_workBuffer3;

	for(int pass=0; pass<32/BITS_PER_PASS; pass++)
	{
		delete data->m_constBuffer[pass];
	}

	Copy<TYPE>::deallocate( data->m_copyData );
	delete data;
}
// In-place key sort: 4 bits per pass, ping-ponging between inout and
// workBuffer0, with a final copy back when the last pass left the result
// in the scratch buffer. n must be a multiple of DATA_ALIGNMENT.
template<DeviceType TYPE>
void RadixSort32<TYPE>::execute(Data* data, Buffer<u32>& inout, int n, int sortBits /* = 32 */ )
{
ADLASSERT( n%DATA_ALIGNMENT == 0 );
ADLASSERT( n <= data->m_maxSize );
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
ADLASSERT( BITS_PER_PASS == 4 );
ADLASSERT( WG_SIZE == 64 );
ADLASSERT( (sortBits&0x3) == 0 );
Buffer<u32>* src = &inout;
Buffer<u32>* dst = data->m_workBuffer0;
Buffer<u32>* histogramBuffer = data->m_workBuffer1;
int nWGs = NUM_WGS;
ConstData cdata;
{
// Split the input into blocks of ELEMENTS_PER_WORK_ITEM*WG_SIZE elements
// and distribute them over the work groups; shrink the dispatch when
// there are fewer blocks than work groups.
int nBlocks = (n+ELEMENTS_PER_WORK_ITEM*WG_SIZE-1)/(ELEMENTS_PER_WORK_ITEM*WG_SIZE);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
if( nBlocks < NUM_WGS )
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
// One count / scan / scatter round per 4-bit digit.
for(int ib=0; ib<sortBits; ib+=4)
{
cdata.m_startBit = ib;
{// per-work-group bucket histogram
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer ) };
Launcher launcher( data->m_device, data->m_streamCountKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
launcher.launch1D( NUM_WGS*WG_SIZE, WG_SIZE );
}
{// prefix scan group histogram
BufferInfo bInfo[] = { BufferInfo( histogramBuffer ) };
Launcher launcher( data->m_device, data->m_prefixScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
launcher.launch1D( 128, 128 );
}
{// local sort and distribute
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer, true ), BufferInfo( dst ) };
Launcher launcher( data->m_device, data->m_sortAndScatterKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
}
swap2( src, dst );
}
// Odd number of passes: sorted data ended up in the scratch buffer.
if( src != &inout )
{
Copy<TYPE>::execute( data->m_copyData, (Buffer<float>&)inout, (Buffer<float>&)*src, n );
}
}
// Out-of-place key sort: ping-pongs between the buffers, switching the
// destination to `out` at the second pass so the final pass writes there.
// NOTE(review): this placement only lands the result in `out` when sortBits
// is 4 or a multiple of 8 (e.g. 8/16/24/32); for 12/20/28 the last swap
// leaves the sorted data in the scratch buffer — verify callers.
template<DeviceType TYPE>
void RadixSort32<TYPE>::execute(Data* data, Buffer<u32>& in, Buffer<u32>& out, int n, int sortBits /* = 32 */ )
{
ADLASSERT( n%DATA_ALIGNMENT == 0 );
ADLASSERT( n <= data->m_maxSize );
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
ADLASSERT( BITS_PER_PASS == 4 );
ADLASSERT( WG_SIZE == 64 );
ADLASSERT( (sortBits&0x3) == 0 );
Buffer<u32>* src = &in;
Buffer<u32>* dst = data->m_workBuffer0;
Buffer<u32>* histogramBuffer = data->m_workBuffer1;
int nWGs = NUM_WGS;
ConstData cdata;
{
// Block distribution over work groups; shrink the dispatch when there
// are fewer blocks than work groups.
int nBlocks = (n+ELEMENTS_PER_WORK_ITEM*WG_SIZE-1)/(ELEMENTS_PER_WORK_ITEM*WG_SIZE);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
if( nBlocks < NUM_WGS )
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
// Single pass writes straight to the output.
if( sortBits == 4 ) dst = &out;
for(int ib=0; ib<sortBits; ib+=4)
{
// From the second pass on, alternate between `out` and the scratch buffer.
if( ib==4 )
{
dst = &out;
}
cdata.m_startBit = ib;
{// per-work-group bucket histogram
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer ) };
Launcher launcher( data->m_device, data->m_streamCountKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
launcher.launch1D( NUM_WGS*WG_SIZE, WG_SIZE );
}
{// prefix scan group histogram
BufferInfo bInfo[] = { BufferInfo( histogramBuffer ) };
Launcher launcher( data->m_device, data->m_prefixScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
launcher.launch1D( 128, 128 );
}
{// local sort and distribute
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer, true ), BufferInfo( dst ) };
Launcher launcher( data->m_device, data->m_sortAndScatterKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
}
swap2( src, dst );
}
}
// Out-of-place key/value sort: same pass structure as the key-only variant,
// ping-ponging keys and values in lockstep. Destination switches to the
// caller's output buffers at the second pass.
// NOTE(review): like the key-only out-of-place variant, the result lands in
// keysOut/valuesOut only for sortBits == 4 or a multiple of 8 — verify.
template<DeviceType TYPE>
void RadixSort32<TYPE>::execute(Data* data, Buffer<u32>& keysIn, Buffer<u32>& keysOut, Buffer<u32>& valuesIn, Buffer<u32>& valuesOut, int n, int sortBits /* = 32 */)
{
ADLASSERT( n%DATA_ALIGNMENT == 0 );
ADLASSERT( n <= data->m_maxSize );
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
ADLASSERT( BITS_PER_PASS == 4 );
ADLASSERT( WG_SIZE == 64 );
ADLASSERT( (sortBits&0x3) == 0 );
Buffer<u32>* src = &keysIn;
Buffer<u32>* srcVal = &valuesIn;
Buffer<u32>* dst = data->m_workBuffer0;
Buffer<u32>* dstVal = data->m_workBuffer2;
Buffer<u32>* histogramBuffer = data->m_workBuffer1;
int nWGs = NUM_WGS;
ConstData cdata;
{
// Block distribution over work groups.
int nBlocks = (n+ELEMENTS_PER_WORK_ITEM*WG_SIZE-1)/(ELEMENTS_PER_WORK_ITEM*WG_SIZE);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
if( nBlocks < NUM_WGS )
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
// Single pass writes straight to the outputs.
if( sortBits == 4 )
{
dst = &keysOut;
dstVal = &valuesOut;
}
for(int ib=0; ib<sortBits; ib+=4)
{
if( ib==4 )
{
dst = &keysOut;
dstVal = &valuesOut;
}
cdata.m_startBit = ib;
{// per-work-group bucket histogram (keys only)
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer ) };
Launcher launcher( data->m_device, data->m_streamCountKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
launcher.launch1D( NUM_WGS*WG_SIZE, WG_SIZE );
}
{// prefix scan group histogram
BufferInfo bInfo[] = { BufferInfo( histogramBuffer ) };
Launcher launcher( data->m_device, data->m_prefixScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
launcher.launch1D( 128, 128 );
}
{// local sort and distribute
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( srcVal, true ), BufferInfo( histogramBuffer, true ), BufferInfo( dst ), BufferInfo( dstVal ) };
Launcher launcher( data->m_device, data->m_sortAndScatterKeyValueKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
}
swap2( src, dst );
swap2( srcVal, dstVal );
}
}
// In-place SortData (key+value packed) sort, ping-ponging with workBuffer3.
// Only supports an even number of passes (sortBits multiples of 8): the
// final copy-back for the odd case is not implemented (asserts below).
template<DeviceType TYPE>
void RadixSort32<TYPE>::execute(Data* data, Buffer<SortData>& keyValuesInOut, int n, int sortBits /* = 32 */)
{
ADLASSERT( n%DATA_ALIGNMENT == 0 );
ADLASSERT( n <= data->m_maxSize );
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
ADLASSERT( BITS_PER_PASS == 4 );
ADLASSERT( WG_SIZE == 64 );
ADLASSERT( (sortBits&0x3) == 0 );
Buffer<SortData>* src = &keyValuesInOut;
Buffer<SortData>* dst = data->m_workBuffer3;
Buffer<u32>* histogramBuffer = data->m_workBuffer1;
int nWGs = NUM_WGS;
ConstData cdata;
{
// Block distribution over work groups.
int nBlocks = (n+ELEMENTS_PER_WORK_ITEM*WG_SIZE-1)/(ELEMENTS_PER_WORK_ITEM*WG_SIZE);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
if( nBlocks < NUM_WGS )
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
// Track pass parity so we can detect when the result ends up in scratch.
int count=0;
for(int ib=0; ib<sortBits; ib+=4)
{
cdata.m_startBit = ib;
{// per-work-group bucket histogram
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer ) };
Launcher launcher( data->m_device, data->m_streamCountSortDataKernel);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
launcher.launch1D( NUM_WGS*WG_SIZE, WG_SIZE );
}
{// prefix scan group histogram
BufferInfo bInfo[] = { BufferInfo( histogramBuffer ) };
Launcher launcher( data->m_device, data->m_prefixScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
launcher.launch1D( 128, 128 );
}
{// local sort and distribute
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer, true ), BufferInfo( dst )};
Launcher launcher( data->m_device, data->m_sortAndScatterSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[ib/4], cdata );
launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
}
swap2( src, dst );
count++;
}
if (count&1)
{
ADLASSERT(0);//need to copy from workbuffer to keyValuesInOut
}
}
#undef PATH
#undef RADIXSORT32_KERNEL0
#undef RADIXSORT32_KERNEL1
#undef RADIXSORT32_KERNEL2
#undef RADIXSORT32_KERNEL3
// Fix: also undefine the remaining kernel-name macros defined at the top of
// this file so they do not leak into other files that include this .inl.
#undef RADIXSORT32_KERNEL4
#undef RADIXSORT32_KERNEL5

View File

@@ -0,0 +1,163 @@
/*
2011 Takahiro Harada
*/
// CPU reference implementation of RadixSort32: classic LSD counting sort,
// 8 bits (256 buckets) per pass, ping-ponging with one scratch buffer.
template<>
class RadixSort32<TYPE_HOST> : public RadixSort32Base
{
public:
typedef Launcher::BufferInfo BufferInfo;
enum
{
BITS_PER_PASS = 8,
NUM_TABLES = (1<<BITS_PER_PASS),
};
struct Data
{
// Scratch buffer for the ping-pong passes; sized maxSize at allocate().
HostBuffer<u32>* m_workBuffer;
};
// Creates host-side sort state able to sort up to maxSize keys.
static
Data* allocate(const Device* device, int maxSize)
{
ADLASSERT( device->m_type == TYPE_HOST );
Data* data = new Data;
data->m_workBuffer = new HostBuffer<u32>( device, maxSize );
return data;
}
static
void deallocate(Data* data)
{
delete data->m_workBuffer;
delete data;
}
// In-place key-only sort on the low sortBits bits.
static
void execute(Data* data, Buffer<u32>& inout, int n, int sortBits = 32)
{
ADLASSERT( inout.getType() == TYPE_HOST );
// tables[]: per-bucket start offsets; counter[]: elements placed so far.
int tables[NUM_TABLES];
int counter[NUM_TABLES];
u32* src = inout.m_ptr;
u32* dst = data->m_workBuffer->m_ptr;
for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
{
// histogram of the current 8-bit digit
for(int i=0; i<NUM_TABLES; i++)
{
tables[i] = 0;
}
for(int i=0; i<n; i++)
{
int tableIdx = (src[i] >> startBit) & (NUM_TABLES-1);
tables[tableIdx]++;
}
// prefix scan
int sum = 0;
for(int i=0; i<NUM_TABLES; i++)
{
int iData = tables[i];
tables[i] = sum;
sum += iData;
counter[i] = 0;
}
// distribute
for(int i=0; i<n; i++)
{
int tableIdx = (src[i] >> startBit) & (NUM_TABLES-1);
dst[tables[tableIdx] + counter[tableIdx]] = src[i];
counter[tableIdx] ++;
}
swap2( src, dst );
}
{
// Odd pass count: result is in the scratch buffer, copy it back
// (after the final swap, dst points at inout's storage).
if( src != inout.m_ptr )
{
memcpy( dst, src, sizeof(u32)*n );
}
}
}
// Key/value sort; values follow their keys through every pass.
// NOTE(review): valueInout is declared const but its storage is written
// through m_ptr; also the initial bufVal.write() copy appears redundant
// since bufVal is fully overwritten on the first pass — verify intent.
static
void execute(Data* data, Buffer<u32>& keyInout, const Buffer<u32>& valueInout, int n, int sortBits = 32)
{
ADLASSERT( keyInout.getType() == TYPE_HOST );
int tables[NUM_TABLES];
int counter[NUM_TABLES];
u32* src = keyInout.m_ptr;
u32* dst = data->m_workBuffer->m_ptr;
// Scratch for the value stream, ping-ponged in lockstep with the keys.
HostBuffer<u32> bufVal(valueInout.m_device, valueInout.m_size);
bufVal.write(valueInout.m_ptr, valueInout.m_size);
u32* srcVal = valueInout.m_ptr;
u32* dstVal = bufVal.m_ptr;
for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
{
// histogram of the current 8-bit digit
for(int i=0; i<NUM_TABLES; i++)
{
tables[i] = 0;
}
for(int i=0; i<n; i++)
{
int tableIdx = (src[i] >> startBit) & (NUM_TABLES-1);
tables[tableIdx]++;
}
// prefix scan
int sum = 0;
for(int i=0; i<NUM_TABLES; i++)
{
int iData = tables[i];
tables[i] = sum;
sum += iData;
counter[i] = 0;
}
// distribute
for(int i=0; i<n; i++)
{
int tableIdx = (src[i] >> startBit) & (NUM_TABLES-1);
int newIdx = tables[tableIdx] + counter[tableIdx];
dst[newIdx] = src[i];
dstVal[newIdx] = srcVal[i];
counter[tableIdx]++;
}
swap2( src, dst );
swap2( srcVal, dstVal );
}
{
// Odd pass count: copy keys and values back from scratch.
if( src != keyInout.m_ptr )
{
memcpy( dst, src, sizeof(u32)*n );
}
if( srcVal != valueInout.m_ptr )
{
memcpy( dstVal, srcVal, sizeof(u32)*n );
}
}
}
};

View File

@@ -0,0 +1,985 @@
/*
2011 Takahiro Harada
*/

// DX11 back end of the advanced key/value radix sort. The macros below map
// the kernels' generic names onto HLSL system values and intrinsics so the
// same kernel bodies can be shared across compute back ends.
typedef uint u32;

#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
#define AtomInc(x) InterlockedAdd(x, 1)
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
#define min2 min
#define max2 max

// Per-pass constants set by the host.
cbuffer CB0 : register( b0 )
{
	int m_startBit;				// lowest bit of the 4-bit digit sorted this pass
	int m_totalBlocks;			// total WG_SIZE*ELEMENTS_PER_WORK_ITEM blocks in the input
	int m_nWorkGroupsToExecute;	// work groups launched; stride of the bucket-major histogram
	int m_nBlocksPerGroup;		// blocks each work group processes serially
};

typedef struct {
	unsigned int key;
	unsigned int value;
} KeyValuePair;

// Scanned per-group histogram produced by StreamCountKernel + PrefixScanKernel.
StructuredBuffer<u32> rHistogram : register(t0);

RWStructuredBuffer<KeyValuePair> dataToSort : register( u0 );
RWStructuredBuffer<KeyValuePair> dataToSortOut : register( u1 );

#define WG_SIZE 128
#define ELEMENTS_PER_WORK_ITEM 4
#define BITS_PER_PASS 4
#define NUM_BUCKET (1<<BITS_PER_PASS)

// Shared (LDS) storage. sorterSharedMemory doubles as prefix-scan scratch
// and as the staging area for locally sorted key/value pairs.
groupshared u32 sorterSharedMemory[max(WG_SIZE*2*2, WG_SIZE*ELEMENTS_PER_WORK_ITEM*2)];
groupshared u32 localHistogramToCarry[NUM_BUCKET];	// running global write offset per bucket
groupshared u32 localHistogram[NUM_BUCKET*2];
groupshared u32 localHistogramMat[NUM_BUCKET*WG_SIZE];
groupshared u32 localPrefixSum[NUM_BUCKET];

// Store/load a KeyValuePair at slot 'idx' of sorterSharedMemory.
#define SET_LOCAL_SORT_DATA(idx, sortDataIn) sorterSharedMemory[2*(idx)+0] = sortDataIn.key; sorterSharedMemory[2*(idx)+1] = sortDataIn.value;
#define GET_LOCAL_SORT_DATA(idx, sortDataOut) sortDataOut.key = sorterSharedMemory[2*(idx)+0]; sortDataOut.value = sorterSharedMemory[2*(idx)+1];
// Inclusive prefix sum of the four lanes: (x, x+y, x+y+z, x+y+z+w).
uint4 prefixScanVector( uint4 data )
{
	uint4 scanned;
	scanned.x = data.x;
	scanned.y = data.x + data.y;
	scanned.z = scanned.y + data.z;
	scanned.w = scanned.z + data.w;
	return scanned;
}
// In-place exclusive prefix sum of the four lanes: data becomes
// (0, x, x+y, x+y+z); the return value is the total x+y+z+w.
uint prefixScanVectorEx( inout uint4 data )
{
	uint4 v = data;
	data.x = 0;
	data.y = v.x;
	data.z = v.x + v.y;
	data.w = data.z + v.z;
	return data.w + v.w;
}
// Work-group-wide (128-lane) exclusive prefix sum of one u32 per lane.
// Returns this lane's exclusive prefix; totalSum receives the sum of all
// 128 inputs. LDS layout: [0,WG_SIZE) is a zero apron so the strided adds
// below can read "out of range" without clamping; [WG_SIZE,2*WG_SIZE)
// holds the inputs.
uint localPrefixScan128( uint pData, uint lIdx, inout uint totalSum )
{
	{ // Set data
		sorterSharedMemory[lIdx] = 0;
		sorterSharedMemory[lIdx+WG_SIZE] = pData;
	}
	GROUP_LDS_BARRIER;
	{ // Prefix sum
		// Each of the first 64 lanes folds neighbours into its odd slot at
		// strides 1,2,4,...,64, then patches the even slot to its left.
		// NOTE(review): the cross-lane dependent reads have no barriers in
		// between, so this relies on the 64 active lanes running in lockstep
		// (a single wavefront) -- confirm for the target hardware.
		int idx = 2*lIdx + (WG_SIZE+1);
		if( lIdx < 64 )
		{
			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
		}
		if( lIdx < 64 ) sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
	}
	GROUP_LDS_BARRIER;
	totalSum = sorterSharedMemory[WG_SIZE*2-1];	// inclusive total of all 128 inputs
	return sorterSharedMemory[lIdx+127];		// slot (lIdx-1) of the data region = exclusive prefix
}
// Two independent 128-lane exclusive prefix scans in one call: pData0 uses
// the first 2*WG_SIZE slots of sorterSharedMemory, pData1 the next 2*WG_SIZE
// (hence the 2*WG_SIZE offsets). rank0/rank1 receive the exclusive prefixes,
// totalSum0/totalSum1 the two totals.
void localPrefixScan128Dual( uint pData0, uint pData1, uint lIdx,
	inout uint rank0, inout uint rank1,
	inout uint totalSum0, inout uint totalSum1 )
{
	{ // Set data
		sorterSharedMemory[lIdx] = 0;
		sorterSharedMemory[lIdx+WG_SIZE] = pData0;
		sorterSharedMemory[2*WG_SIZE+lIdx] = 0;
		sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1;
	}
	GROUP_LDS_BARRIER;
	// if( lIdx < 128 ) // todo. assert wg size is 128
	{ // Prefix sum
		// Lanes 0..63 scan the first array, lanes 64..127 the second:
		// blockIdx selects the array, groupIdx the odd slot within it.
		// NOTE(review): same lockstep assumption as localPrefixScan128 --
		// no barriers between the dependent strided reads of each 64-lane half.
		int blockIdx = lIdx/64;
		int groupIdx = lIdx%64;
		int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;
		sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
		sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
	}
	GROUP_LDS_BARRIER;
	totalSum0 = sorterSharedMemory[WG_SIZE*2-1];
	rank0 = sorterSharedMemory[lIdx+127];
	totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];
	rank1 = sorterSharedMemory[2*WG_SIZE+lIdx+127];
}
// Exclusive prefix sum over all 512 values of the work group (4 per lane).
// pData holds this lane's 4 values; the return is their exclusive prefixes
// relative to the whole 512-element sequence. totalSum receives the grand
// total.
uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum )
{
	{ // Set data
		sorterSharedMemory[lIdx] = 0;
		// prefixScanVectorEx makes pData exclusive within the vector and
		// returns the 4-element total, which is what gets scanned across lanes.
		sorterSharedMemory[lIdx+WG_SIZE] = prefixScanVectorEx( pData );
	}
	GROUP_LDS_BARRIER;
	{ // Prefix sum
		// Same lockstep-dependent strided ladder as localPrefixScan128
		// (see the note there).
		int idx = 2*lIdx + (WG_SIZE+1);
		if( lIdx < 64 )
		{
			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
		}
	}
	GROUP_LDS_BARRIER;
	totalSum = sorterSharedMemory[WG_SIZE*2-1];
	uint addValue = sorterSharedMemory[lIdx+127];	// exclusive offset of this lane's chunk
	return pData + uint4(addValue, addValue, addValue, addValue);
}
// Two 512-element exclusive scans in one call (dual analogue of
// localPrefixSum128V): pData0/pData1 are each lane's 4 values for the two
// sequences; dataOut0/dataOut1 receive the exclusive prefixes, and
// totalSum0/totalSum1 the two grand totals.
void localPrefixSum128Dual( uint4 pData0, uint4 pData1, uint lIdx,
	inout uint4 dataOut0, inout uint4 dataOut1,
	inout uint totalSum0, inout uint totalSum1 )
{
	/*
	dataOut0 = localPrefixSum128V( pData0, lIdx, totalSum0 );
	GROUP_LDS_BARRIER;
	dataOut1 = localPrefixSum128V( pData1, lIdx, totalSum1 );
	return;
	*/
	// Keep the originals so the inclusive vector scans can be converted back
	// to exclusive at the end (inclusive - original = exclusive).
	uint4 backup0 = pData0;
	uint4 backup1 = pData1;
	{ // Prefix sum in a vector
		pData0 = prefixScanVector( pData0 );
		pData1 = prefixScanVector( pData1 );
	}
	{ // Set data
		sorterSharedMemory[lIdx] = 0;
		sorterSharedMemory[lIdx+WG_SIZE] = pData0.w;	// 4-element total of sequence 0
		sorterSharedMemory[2*WG_SIZE+lIdx] = 0;
		sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1.w;
	}
	GROUP_LDS_BARRIER;
	// if( lIdx < 128 ) // todo. assert wg size is 128
	{ // Prefix sum
		// Lanes 0..63 scan sequence 0, lanes 64..127 sequence 1.
		// NOTE(review): lockstep assumption as in localPrefixScan128.
		int blockIdx = lIdx/64;
		int groupIdx = lIdx%64;
		int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;
		sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
		sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
	}
	GROUP_LDS_BARRIER;
	totalSum0 = sorterSharedMemory[WG_SIZE*2-1];
	{
		// inclusive vector scan + chunk offset - original = exclusive prefix
		uint addValue = sorterSharedMemory[lIdx+127];
		dataOut0 = pData0 + uint4(addValue, addValue, addValue, addValue) - backup0;
	}
	totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];
	{
		uint addValue = sorterSharedMemory[2*WG_SIZE+lIdx+127];
		dataOut1 = pData1 + uint4(addValue, addValue, addValue, addValue) - backup1;
	}
}
// Per-lane match mask: 1 where the lane's digit equals targetKey, else 0.
uint4 extractKeys(uint4 data, uint targetKey)
{
	return uint4( (data.x == targetKey) ? 1 : 0,
		(data.y == targetKey) ? 1 : 0,
		(data.z == targetKey) ? 1 : 0,
		(data.w == targetKey) ? 1 : 0 );
}
// Per-lane bit mask: bit 'targetKey' of each lane, shifted down to 0/1.
uint4 extractKeysByBits(uint4 data, uint targetKey)
{
	return uint4( (data.x >> targetKey) & 1,
		(data.y >> targetKey) & 1,
		(data.z >> targetKey) & 1,
		(data.w >> targetKey) & 1 );
}
// Pack two 16-bit counters into one u32: 'upper' in the high half.
uint packKeys(uint lower, uint upper)
{
	return (upper<<16) | lower;
}
// Component-wise 16:16 pack of two uint4 counter vectors.
uint4 packKeys(uint4 lower, uint4 upper)
{
	return lower | (upper<<16);
}
// Low 16-bit half of a packed 16:16 counter pair.
uint extractLower( uint data )
{
	return data & 0x0000ffff;
}
// High 16-bit half of a packed 16:16 counter pair.
uint extractUpper( uint data )
{
	return (data >> 16) & 0x0000ffff;
}
// Component-wise low 16-bit halves of four packed counter pairs.
uint4 extractLower( uint4 data )
{
	return data & 0xffff;
}
// Component-wise high 16-bit halves of four packed counter pairs.
uint4 extractUpper( uint4 data )
{
	return (data >> 16) & 0xffff;
}
// Sort-and-scatter pass, variant A: ranks the WG_SIZE*ELEMENTS_PER_WORK_ITEM
// key/value pairs of each block by the current 4-bit digit using packed dual
// prefix scans (two 16-bit rank counters share one u32), stages the locally
// sorted pairs in LDS, then scatters them to their global position using the
// scanned per-group histogram in rHistogram.
[numthreads(WG_SIZE, 1, 1)]
void SortAndScatterKernel( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	// Load this group's global write offset for each bucket (bucket-major layout).
	if( lIdx < (NUM_BUCKET) )
	{
		localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx];
	}
	GROUP_LDS_BARRIER;
	for(uint igroup=wgIdx*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx+1)*m_nBlocksPerGroup); igroup++)
	{
		u32 myHistogram;	// this block's count of bucket lIdx (only meaningful for lIdx<NUM_BUCKET)
		if( lIdx < (NUM_BUCKET) )
		{
			localPrefixSum[lIdx] = 0.f;	// NOTE(review): float literal stored into u32 LDS; works via implicit conversion but should read 0
		}
		u32 newOffset[4];
		KeyValuePair myData[4];
		{ // read data
			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
			uint startAddress = igroup*numLocalElements + lIdx*4;
			myData[0] = dataToSort[startAddress+0];
			myData[1] = dataToSort[startAddress+1];
			myData[2] = dataToSort[startAddress+2];
			myData[3] = dataToSort[startAddress+3];
			newOffset[0] = newOffset[1] = newOffset[2] = newOffset[3] = 0;
		}
		int localOffset = 0;
		// b = the 4-bit digit of each of this lane's 4 keys.
		uint4 b = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);
		// Rank the keys 4 buckets at a time: 4 match masks get prefix-summed
		// pairwise in the low/high 16 bits of two packed dual scans.
		for(uint targetKey=0; targetKey<(NUM_BUCKET); targetKey+=4)
		{
			uint4 key[4];
			uint keySet[2];
			{ // pack 4
				uint4 scannedKey[4];
				key[0] = scannedKey[0] = extractKeys( b, targetKey+0 );
				key[1] = scannedKey[1] = extractKeys( b, targetKey+1 );
				key[2] = scannedKey[2] = extractKeys( b, targetKey+2 );
				key[3] = scannedKey[3] = extractKeys( b, targetKey+3 );
				{
					uint s[4];
					s[0] = prefixScanVectorEx( scannedKey[0] );
					s[1] = prefixScanVectorEx( scannedKey[1] );
					s[2] = prefixScanVectorEx( scannedKey[2] );
					s[3] = prefixScanVectorEx( scannedKey[3] );
					keySet[0] = packKeys( s[0], s[1] );
					keySet[1] = packKeys( s[2], s[3] );
				}
			}
			uint dstAddressBase[4];
			{
				uint totalSumPacked[2];
				uint dstAddressPacked[2];
				localPrefixScan128Dual( keySet[0], keySet[1], lIdx, dstAddressPacked[0], dstAddressPacked[1], totalSumPacked[0], totalSumPacked[1] );
				// Unpack per-bucket rank bases and per-bucket block totals.
				dstAddressBase[0] = extractLower( dstAddressPacked[0] );
				dstAddressBase[1] = extractUpper( dstAddressPacked[0] );
				dstAddressBase[2] = extractLower( dstAddressPacked[1] );
				dstAddressBase[3] = extractUpper( dstAddressPacked[1] );
				uint4 histogram;
				histogram.x = extractLower(totalSumPacked[0]);
				histogram.y = extractUpper(totalSumPacked[0]);
				histogram.z = extractLower(totalSumPacked[1]);
				histogram.w = extractUpper(totalSumPacked[1]);
				// Lane (targetKey+i) records bucket (targetKey+i)'s count and
				// exclusive offset; localOffset accumulates across iterations.
				if( lIdx == targetKey + 0 ) myHistogram = histogram.x;
				else if( lIdx == targetKey + 1 ) myHistogram = histogram.y;
				else if( lIdx == targetKey + 2 ) myHistogram = histogram.z;
				else if( lIdx == targetKey + 3 ) myHistogram = histogram.w;
				uint histogramSum = prefixScanVectorEx( histogram );
				if( lIdx == targetKey + 0 ) localPrefixSum[targetKey+0] = localOffset+histogram.x;
				else if( lIdx == targetKey + 1 ) localPrefixSum[targetKey+1] = localOffset+histogram.y;
				else if( lIdx == targetKey + 2 ) localPrefixSum[targetKey+2] = localOffset+histogram.z;
				else if( lIdx == targetKey + 3 ) localPrefixSum[targetKey+3] = localOffset+histogram.w;
				localOffset += histogramSum;
			}
			GROUP_LDS_BARRIER;
			// Accumulate each key's final local rank; only the matching
			// bucket's term is non-zero thanks to the 0/1 masks in key[].
			for(int ie=0; ie<4; ie++)
			{
				uint4 scannedKey = key[ie];
				prefixScanVectorEx( scannedKey );
				uint offset = localPrefixSum[targetKey + ie] + dstAddressBase[ie];
				uint4 dstAddress = uint4( offset, offset, offset, offset ) + scannedKey;
				newOffset[0] += dstAddress.x*key[ie].x;
				newOffset[1] += dstAddress.y*key[ie].y;
				newOffset[2] += dstAddress.z*key[ie].z;
				newOffset[3] += dstAddress.w*key[ie].w;
			}
		}
		{ // local scatter
			SET_LOCAL_SORT_DATA(newOffset[0], myData[0]);
			SET_LOCAL_SORT_DATA(newOffset[1], myData[1]);
			SET_LOCAL_SORT_DATA(newOffset[2], myData[2]);
			SET_LOCAL_SORT_DATA(newOffset[3], myData[3]);
		}
		GROUP_LDS_BARRIER;
		{ // write data
			// Destination = global bucket base + (local index - bucket's local start).
			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
			{
				int dataIdx = 4*lIdx+i;
				KeyValuePair localData; GET_LOCAL_SORT_DATA( dataIdx, localData );
				int binIdx = (localData.key >> m_startBit) & 0xf;
				int groupOffset = localHistogramToCarry[binIdx];
				int myIdx = dataIdx - localPrefixSum[binIdx];
				dataToSortOut[ groupOffset + myIdx ] = localData;
			}
		}
		GROUP_LDS_BARRIER;
		// Advance each bucket's global offset past this block's contribution.
		if( lIdx < NUM_BUCKET )
		{
			localHistogramToCarry[lIdx] += myHistogram;
		}
		GROUP_LDS_BARRIER;
	}
}
// Sort-and-scatter pass, variant B: sorts each block locally with four
// stable 1-bit split passes over the 4-bit digit, rebuilds the per-bucket
// histogram with LDS atomics, then scatters to global memory like variant A.
[numthreads(WG_SIZE, 1, 1)]
void SortAndScatterKernel1( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	// Load this group's global write offset for each bucket.
	if( lIdx < (NUM_BUCKET) )
	{
		localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx.x];
	}
	GROUP_LDS_BARRIER;
	for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)
	{
		u32 myHistogram;	// this block's count of bucket lIdx (valid for lIdx<NUM_BUCKET)
		KeyValuePair myData[4];
		uint startAddrBlock;
		{ // read data
			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
			startAddrBlock = lIdx*4;
			uint startAddress = igroup*numLocalElements + startAddrBlock;
			myData[0] = dataToSort[startAddress+0];
			myData[1] = dataToSort[startAddress+1];
			myData[2] = dataToSort[startAddress+2];
			myData[3] = dataToSort[startAddress+3];
		}
		// local sort
		// One stable split per bit: 'keys' is 1 where the bit is 0, so zeros
		// are packed to the front (rankOfP) and ones to the back (rankOfN).
		for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)
		{
			uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);
			uint total;
			uint4 rankOfP = localPrefixSum128V( keys, lIdx, total );
			uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );
			uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;
			GROUP_LDS_BARRIER;
			SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );
			SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );
			SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );
			SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );
			GROUP_LDS_BARRIER;
			GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );
			GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );
			GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );
			GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );
		}
		{// create histogram -> prefix sum
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[lIdx] = 0;	// zero apron for the strided scan below
				localHistogram[NUM_BUCKET+lIdx] = 0;
			}
			GROUP_LDS_BARRIER;
			uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);
			InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );
			InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );
			InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );
			InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );
			GROUP_LDS_BARRIER;
			uint hIdx = NUM_BUCKET+lIdx;
			if( lIdx < NUM_BUCKET )
			{
				myHistogram = localHistogram[hIdx];
			}
			GROUP_LDS_BARRIER;
			// Turn the counts into per-bucket start offsets (shift + strided adds).
			// NOTE(review): no barriers between these dependent cross-lane reads;
			// relies on the 16 active lanes running in lockstep -- confirm for
			// the target hardware.
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[hIdx] = localHistogram[hIdx-1];
				localHistogram[hIdx] += localHistogram[hIdx-1];
				localHistogram[hIdx] += localHistogram[hIdx-2];
				localHistogram[hIdx] += localHistogram[hIdx-4];
				localHistogram[hIdx] += localHistogram[hIdx-8];
			}
			GROUP_LDS_BARRIER;
		}
		/*
		{// write back
		int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
		startAddrBlock = lIdx*4;
		uint startAddress = igroup*numLocalElements + startAddrBlock;
		for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
		{
		dataToSortOut[ startAddress+ie ] = myData[ie];
		}
		}
		*/
		{
			// Destination = global bucket base + (local index - bucket's local start).
			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
			{
				int dataIdx = startAddrBlock+ie;
				int binIdx = (myData[ie].key>>m_startBit)&0xf;
				int groupOffset = localHistogramToCarry[binIdx];
				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];
				dataToSortOut[ groupOffset + myIdx ] = myData[ie];
			}
		}
		GROUP_LDS_BARRIER;
		// Advance each bucket's global offset past this block's contribution.
		if( lIdx < NUM_BUCKET )
		{
			localHistogramToCarry[lIdx] += myHistogram;
		}
		GROUP_LDS_BARRIER;
	}
}
/*
[numthreads(WG_SIZE, 1, 1)]
void SortAndScatterKernel1( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )
{
if( lIdx.x < (NUM_BUCKET) )
{
localHistogramToCarry[lIdx.x] = rHistogram[lIdx.x*m_nWorkGroupsToExecute + gIdx.x];
}
GROUP_LDS_BARRIER;
for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)
{
u32 myHistogram;
KeyValuePair myData[4];
uint startAddrBlock;
{ // read data
int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
startAddrBlock = lIdx.x*4;
uint startAddress = igroup*numLocalElements + startAddrBlock;
myData[0] = dataToSort[startAddress+0];
myData[1] = dataToSort[startAddress+1];
myData[2] = dataToSort[startAddress+2];
myData[3] = dataToSort[startAddress+3];
}
for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)
{
uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);
uint total;
uint4 rankOfP = localPrefixSum128V( keys, lIdx.x, total );
uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );
uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;
GROUP_LDS_BARRIER;
SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );
SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );
SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );
SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );
GROUP_LDS_BARRIER;
GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );
GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );
GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );
GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );
}
{// create histogram -> prefix sum
if( lIdx.x < NUM_BUCKET )
{
localHistogram[lIdx.x] = 0;
localHistogram[NUM_BUCKET+lIdx.x] = 0;
}
GROUP_LDS_BARRIER;
uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);
InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );
InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );
InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );
InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );
GROUP_LDS_BARRIER;
uint hIdx = NUM_BUCKET+lIdx.x;
if( lIdx.x < NUM_BUCKET )
{
myHistogram = localHistogram[hIdx];
}
GROUP_LDS_BARRIER;
if( lIdx.x < NUM_BUCKET )
{
localHistogram[hIdx] = localHistogram[hIdx-1];
localHistogram[hIdx] += localHistogram[hIdx-1];
localHistogram[hIdx] += localHistogram[hIdx-2];
localHistogram[hIdx] += localHistogram[hIdx-4];
localHistogram[hIdx] += localHistogram[hIdx-8];
}
GROUP_LDS_BARRIER;
}
{// write back
for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
{
int dataIdx = startAddrBlock+ie;
int binIdx = (myData[ie].key>>m_startBit)&0xf;
int groupOffset = localHistogramToCarry[binIdx];
int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];
dataToSortOut[ groupOffset + myIdx ] = myData[ie];
}
}
GROUP_LDS_BARRIER;
if( lIdx.x < NUM_BUCKET )
{
localHistogramToCarry[lIdx.x] += myHistogram;
}
GROUP_LDS_BARRIER;
}
}
*/
// Inputs/outputs of the stream-count (histogram) pass.
StructuredBuffer<KeyValuePair> dataToSort1 : register( t0 );
RWStructuredBuffer<u32> wHistogram1 : register(u0);
// This lane's LDS histogram slot for bucket 'idx' (one column per lane;
// expands lIdx from the enclosing kernel's scope).
#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx.x]
// Histogram pass: counts, per work group, how many keys fall into each of
// the NUM_BUCKET values of the current 4-bit digit. Each lane accumulates
// into its own LDS column (MY_HISTOGRAM), the WG_SIZE columns of each
// bucket row are tree-reduced into column 0, and one count per bucket is
// written to wHistogram1 laid out bucket-major
// (bucket*m_nWorkGroupsToExecute + group) for the global prefix scan.
[numthreads(WG_SIZE, 1, 1)]
void StreamCountKernel( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	// Clear this lane's histogram column. (Fix: removed the unused
	// 'int myHistogram[NUM_BUCKET]' register array -- a leftover of the
	// commented-out register-based variant; the live counters are the
	// LDS columns written through MY_HISTOGRAM.)
	for(int i=0; i<NUM_BUCKET; i++)
	{
		MY_HISTOGRAM(i) = 0;
	}
	for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)
	{
		uint localKeys[4];
		{ // read data
			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
			uint4 localAddress = uint4(lIdx, lIdx, lIdx, lIdx)*4+uint4(0,1,2,3);
			uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;
			KeyValuePair localData0 = dataToSort1[globalAddress.x];
			KeyValuePair localData1 = dataToSort1[globalAddress.y];
			KeyValuePair localData2 = dataToSort1[globalAddress.z];
			KeyValuePair localData3 = dataToSort1[globalAddress.w];
			localKeys[0] = (localData0.key >> m_startBit) & 0xf;
			localKeys[1] = (localData1.key >> m_startBit) & 0xf;
			localKeys[2] = (localData2.key >> m_startBit) & 0xf;
			localKeys[3] = (localData3.key >> m_startBit) & 0xf;
		}
		// No atomics needed: each lane only touches its own column.
		MY_HISTOGRAM( localKeys[0] )++;
		MY_HISTOGRAM( localKeys[1] )++;
		MY_HISTOGRAM( localKeys[2] )++;
		MY_HISTOGRAM( localKeys[3] )++;
	}
	GROUP_LDS_BARRIER;
	{ // reduce to 1
		// Lanes 0..63 reduce buckets 0..7, lanes 64..127 buckets 8..15.
		// NOTE(review): the strided adds (64..1) run without intervening
		// barriers, which assumes lockstep execution within each 64-lane
		// half -- confirm for the target hardware.
		if( lIdx < 64 )//WG_SIZE/2 )
		{
			for(int i=0; i<NUM_BUCKET/2; i++)
			{
				int idx = lIdx;
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];
			}
		}
		else if( lIdx < 128 )
		{
			for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)
			{
				int idx = lIdx-64;
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];
			}
		}
	}
	GROUP_LDS_BARRIER;
	{ // write data
		// Column 0 of each bucket row now holds the group's total for that bucket.
		if( lIdx < NUM_BUCKET )
		{
			wHistogram1[ lIdx*m_nWorkGroupsToExecute + wgIdx.x ] = localHistogramMat[ lIdx*WG_SIZE+0 ];
		}
	}
}
/*
[numthreads(WG_SIZE, 1, 1)]
void StreamCountKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )
{
int myHistogram[NUM_BUCKET];
for(int i=0; i<NUM_BUCKET; i++)
{
myHistogram[i] = 0;
}
for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)
{
uint localKeys[4];
{ // read data
int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
uint4 localAddress = uint4(lIdx.x, lIdx.x, lIdx.x, lIdx.x)*4+uint4(0,1,2,3);
uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;
KeyValuePair localData0 = dataToSort1[globalAddress.x];
KeyValuePair localData1 = dataToSort1[globalAddress.y];
KeyValuePair localData2 = dataToSort1[globalAddress.z];
KeyValuePair localData3 = dataToSort1[globalAddress.w];
localKeys[0] = (localData0.key >> m_startBit) & 0xf;
localKeys[1] = (localData1.key >> m_startBit) & 0xf;
localKeys[2] = (localData2.key >> m_startBit) & 0xf;
localKeys[3] = (localData3.key >> m_startBit) & 0xf;
}
myHistogram[ localKeys[0] ]++;
myHistogram[ localKeys[1] ]++;
myHistogram[ localKeys[2] ]++;
myHistogram[ localKeys[3] ]++;
}
{ // move to shared
for(int i=0; i<NUM_BUCKET; i++)
{
localHistogramMat[i*WG_SIZE+lIdx.x] = myHistogram[i];
}
}
GROUP_LDS_BARRIER;
{ // reduce to 1
if( lIdx.x < 64 )//WG_SIZE/2 )
{
for(int i=0; i<NUM_BUCKET/2; i++)
{
int idx = lIdx.x;
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];
}
}
else if( lIdx.x < 128 )
{
for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)
{
int idx = lIdx.x-64;
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];
}
}
}
GROUP_LDS_BARRIER;
{ // write data
if( lIdx.x < NUM_BUCKET )
{
wHistogram1[ lIdx.x*m_nWorkGroupsToExecute + gIdx.x ] = localHistogramMat[ lIdx.x*WG_SIZE+0 ];
}
}
}
*/
/*
// for MAX_WG_SIZE 20
[numthreads(WG_SIZE, 1, 1)]
void PrefixScanKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )
{
uint4 myData = uint4(0,0,0,0);
if( 4*lIdx.x+0 < NUM_BUCKET*m_nWorkGroupsToExecute )
myData.x = wHistogram1[4*lIdx.x+0];
if( 4*lIdx.x+1 < NUM_BUCKET*m_nWorkGroupsToExecute )
myData.y = wHistogram1[4*lIdx.x+1];
if( 4*lIdx.x+2 < NUM_BUCKET*m_nWorkGroupsToExecute )
myData.z = wHistogram1[4*lIdx.x+2];
if( 4*lIdx.x+3 < NUM_BUCKET*m_nWorkGroupsToExecute )
myData.w = wHistogram1[4*lIdx.x+3];
uint totalSum;
uint4 scanned = localPrefixSum128V( myData, lIdx.x, totalSum );
wHistogram1[4*lIdx.x+0] = scanned.x;
wHistogram1[4*lIdx.x+1] = scanned.y;
wHistogram1[4*lIdx.x+2] = scanned.z;
wHistogram1[4*lIdx.x+3] = scanned.w;
}
*/
// Global scan pass: exclusive prefix sum over the
// NUM_BUCKET*m_nWorkGroupsToExecute bucket counts in wHistogram1
// (bucket-major), turning counts into global scatter offsets.
// Runs as a single work group; each lane scans 12 elements.
// for MAX_WG_SIZE 80
// can hold up to WG_SIZE*12 (128*12 > 80*16 )
[numthreads(WG_SIZE, 1, 1)]
void PrefixScanKernel( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	uint data[12] = {0,0,0,0,0,0,0,0,0,0,0,0};
	for(int i=0; i<12; i++)
	{
		if( int(12*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )
			data[i] = wHistogram1[12*lIdx+i];
	}
	// Reduce this lane's 12 elements to 4 partial sums of THREE elements
	// each, matching the groups-of-three reconstruction below.
	// (Fix: these sums previously covered only pairs -- data[0]+data[1],
	// ... , data[6]+data[7] -- a leftover of the 8-elements-per-lane variant.
	// That dropped data[2], data[5], data[8] and data[11] from the scan
	// input, so every offset from the third element of each triple onward
	// was too small.)
	uint4 myData = uint4(0,0,0,0);
	myData.x = data[0] + data[1] + data[2];
	myData.y = data[3] + data[4] + data[5];
	myData.z = data[6] + data[7] + data[8];
	myData.w = data[9] + data[10] + data[11];
	uint totalSum;
	uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );
	// Expand the exclusive scan of the triple sums back to 12 elements.
	// Written high-to-low so the original inputs are still intact when read.
	data[11] = scanned.w + data[9] + data[10];
	data[10] = scanned.w + data[9];
	data[9] = scanned.w;
	data[8] = scanned.z + data[6] + data[7];
	data[7] = scanned.z + data[6];
	data[6] = scanned.z;
	data[5] = scanned.y + data[3] + data[4];
	data[4] = scanned.y + data[3];
	data[3] = scanned.y;
	data[2] = scanned.x + data[0] + data[1];
	data[1] = scanned.x + data[0];
	data[0] = scanned.x;
	for(int i=0; i<12; i++)
	{
		wHistogram1[12*lIdx+i] = data[i];
	}
}
/*
[numthreads(WG_SIZE, 1, 1)]
void PrefixScanKernel( DEFAULT_ARGS )
{
u32 lIdx = GET_LOCAL_IDX;
u32 wgIdx = GET_GROUP_IDX;
uint data[8] = {0,0,0,0,0,0,0,0};
for(int i=0; i<8; i++)
{
if( int(8*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )
data[i] = wHistogram1[8*lIdx+i];
}
uint4 myData = uint4(0,0,0,0);
myData.x = data[0] + data[1];
myData.y = data[2] + data[3];
myData.z = data[4] + data[5];
myData.w = data[6] + data[7];
uint totalSum;
uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );
data[7] = scanned.w + data[6];
data[6] = scanned.w;// + data[5];
data[5] = scanned.z + data[4];
data[4] = scanned.z;// + data[3];
data[3] = scanned.y + data[2];
data[2] = scanned.y;// + data[1];
data[1] = scanned.x + data[0];
data[0] = scanned.x;
for(int i=0; i<8; i++)
{
wHistogram1[8*lIdx+i] = data[i];
}
}
*/
// Plain block copy dataToSort -> dataToSortOut using the same blocked
// addressing as the sort kernels (each lane owns 4 consecutive elements
// of each block its work group processes).
[numthreads(WG_SIZE, 1, 1)]
void CopyKernel( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	uint blockEnd = min2(m_totalBlocks, (wgIdx+1)*m_nBlocksPerGroup);
	for(uint igroup=wgIdx*m_nBlocksPerGroup; igroup<blockEnd; igroup++)
	{
		uint addr = igroup*(WG_SIZE*ELEMENTS_PER_WORK_ITEM) + lIdx*4;
		for(int ie=0; ie<4; ie++)
		{
			dataToSortOut[addr+ie] = dataToSort[addr+ie];
		}
	}
}

View File

@@ -0,0 +1,987 @@
static const char* radixSortAdvancedKernelsDX11= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"#define min2 min\n"
"#define max2 max\n"
"\n"
"\n"
"cbuffer CB0 : register( b0 )\n"
"{\n"
" int m_startBit;\n"
" int m_totalBlocks;\n"
" int m_nWorkGroupsToExecute;\n"
" int m_nBlocksPerGroup;\n"
"\n"
"};\n"
"\n"
"\n"
"typedef struct {\n"
" unsigned int key;\n"
" unsigned int value;\n"
"} KeyValuePair;\n"
"\n"
"\n"
"StructuredBuffer<u32> rHistogram : register(t0);\n"
"\n"
"RWStructuredBuffer<KeyValuePair> dataToSort : register( u0 );\n"
"RWStructuredBuffer<KeyValuePair> dataToSortOut : register( u1 );\n"
"\n"
"\n"
"\n"
"#define WG_SIZE 128\n"
"#define ELEMENTS_PER_WORK_ITEM 4\n"
"#define BITS_PER_PASS 4\n"
"#define NUM_BUCKET (1<<BITS_PER_PASS)\n"
"\n"
"\n"
"groupshared u32 sorterSharedMemory[max(WG_SIZE*2*2, WG_SIZE*ELEMENTS_PER_WORK_ITEM*2)];\n"
"groupshared u32 localHistogramToCarry[NUM_BUCKET];\n"
"groupshared u32 localHistogram[NUM_BUCKET*2];\n"
"groupshared u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
"groupshared u32 localPrefixSum[NUM_BUCKET];\n"
"\n"
"\n"
"\n"
"#define SET_LOCAL_SORT_DATA(idx, sortDataIn) sorterSharedMemory[2*(idx)+0] = sortDataIn.key; sorterSharedMemory[2*(idx)+1] = sortDataIn.value; \n"
"#define GET_LOCAL_SORT_DATA(idx, sortDataOut) sortDataOut.key = sorterSharedMemory[2*(idx)+0]; sortDataOut.value = sorterSharedMemory[2*(idx)+1];\n"
"\n"
"\n"
"\n"
"uint4 prefixScanVector( uint4 data )\n"
"{\n"
" data.y += data.x;\n"
" data.w += data.z;\n"
" data.z += data.y;\n"
" data.w += data.y;\n"
" return data;\n"
"}\n"
"\n"
"uint prefixScanVectorEx( inout uint4 data )\n"
"{\n"
" uint4 backup = data;\n"
" data.y += data.x;\n"
" data.w += data.z;\n"
" data.z += data.y;\n"
" data.w += data.y;\n"
" uint sum = data.w;\n"
" data -= backup;\n"
" return sum;\n"
"}\n"
"\n"
"uint localPrefixScan128( uint pData, uint lIdx, inout uint totalSum )\n"
"{\n"
" { // Set data\n"
" sorterSharedMemory[lIdx] = 0;\n"
" sorterSharedMemory[lIdx+WG_SIZE] = pData;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // Prefix sum\n"
" int idx = 2*lIdx + (WG_SIZE+1);\n"
" if( lIdx < 64 )\n"
" {\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
" }\n"
" if( lIdx < 64 ) sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" totalSum = sorterSharedMemory[WG_SIZE*2-1];\n"
" return sorterSharedMemory[lIdx+127];\n"
"}\n"
"\n"
"void localPrefixScan128Dual( uint pData0, uint pData1, uint lIdx, \n"
" inout uint rank0, inout uint rank1,\n"
" inout uint totalSum0, inout uint totalSum1 )\n"
"{\n"
" { // Set data\n"
" sorterSharedMemory[lIdx] = 0;\n"
" sorterSharedMemory[lIdx+WG_SIZE] = pData0;\n"
" sorterSharedMemory[2*WG_SIZE+lIdx] = 0;\n"
" sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
"// if( lIdx < 128 ) // todo. assert wg size is 128\n"
" { // Prefix sum\n"
" int blockIdx = lIdx/64;\n"
" int groupIdx = lIdx%64;\n"
" int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;\n"
"\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
"\n"
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" totalSum0 = sorterSharedMemory[WG_SIZE*2-1];\n"
" rank0 = sorterSharedMemory[lIdx+127];\n"
" totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];\n"
" rank1 = sorterSharedMemory[2*WG_SIZE+lIdx+127];\n"
"}\n"
"\n"
"uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum )\n"
"{\n"
" { // Set data\n"
" sorterSharedMemory[lIdx] = 0;\n"
" sorterSharedMemory[lIdx+WG_SIZE] = prefixScanVectorEx( pData );\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // Prefix sum\n"
" int idx = 2*lIdx + (WG_SIZE+1);\n"
" if( lIdx < 64 )\n"
" {\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
"\n"
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" totalSum = sorterSharedMemory[WG_SIZE*2-1];\n"
" uint addValue = sorterSharedMemory[lIdx+127];\n"
" return pData + uint4(addValue, addValue, addValue, addValue);\n"
"}\n"
"\n"
"void localPrefixSum128Dual( uint4 pData0, uint4 pData1, uint lIdx, \n"
" inout uint4 dataOut0, inout uint4 dataOut1, \n"
" inout uint totalSum0, inout uint totalSum1 )\n"
"{\n"
"/*\n"
" dataOut0 = localPrefixSum128V( pData0, lIdx, totalSum0 );\n"
" GROUP_LDS_BARRIER;\n"
" dataOut1 = localPrefixSum128V( pData1, lIdx, totalSum1 );\n"
" return;\n"
"*/\n"
"\n"
" uint4 backup0 = pData0;\n"
" uint4 backup1 = pData1;\n"
"\n"
" { // Prefix sum in a vector\n"
" pData0 = prefixScanVector( pData0 );\n"
" pData1 = prefixScanVector( pData1 );\n"
" }\n"
"\n"
" { // Set data\n"
" sorterSharedMemory[lIdx] = 0;\n"
" sorterSharedMemory[lIdx+WG_SIZE] = pData0.w;\n"
" sorterSharedMemory[2*WG_SIZE+lIdx] = 0;\n"
" sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1.w;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
"// if( lIdx < 128 ) // todo. assert wg size is 128\n"
" { // Prefix sum\n"
" int blockIdx = lIdx/64;\n"
" int groupIdx = lIdx%64;\n"
" int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;\n"
"\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
"\n"
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" totalSum0 = sorterSharedMemory[WG_SIZE*2-1];\n"
" {\n"
" uint addValue = sorterSharedMemory[lIdx+127];\n"
" dataOut0 = pData0 + uint4(addValue, addValue, addValue, addValue) - backup0;\n"
" }\n"
"\n"
" totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];\n"
" {\n"
" uint addValue = sorterSharedMemory[2*WG_SIZE+lIdx+127];\n"
" dataOut1 = pData1 + uint4(addValue, addValue, addValue, addValue) - backup1;\n"
" }\n"
"}\n"
"\n"
"uint4 extractKeys(uint4 data, uint targetKey)\n"
"{\n"
" uint4 key;\n"
" key.x = data.x == targetKey ? 1:0;\n"
" key.y = data.y == targetKey ? 1:0;\n"
" key.z = data.z == targetKey ? 1:0;\n"
" key.w = data.w == targetKey ? 1:0;\n"
" return key;\n"
"}\n"
"\n"
"uint4 extractKeysByBits(uint4 data, uint targetKey)\n"
"{\n"
" uint4 key;\n"
" uint mask = 1<<targetKey;\n"
" key.x = (data.x & mask) >> targetKey;\n"
" key.y = (data.y & mask) >> targetKey;\n"
" key.z = (data.z & mask) >> targetKey;\n"
" key.w = (data.w & mask) >> targetKey;\n"
" return key;\n"
"}\n"
"\n"
"uint packKeys(uint lower, uint upper)\n"
"{\n"
" return lower|(upper<<16);\n"
"}\n"
"\n"
"uint4 packKeys(uint4 lower, uint4 upper)\n"
"{\n"
" return uint4( lower.x|(upper.x<<16), lower.y|(upper.y<<16), lower.z|(upper.z<<16), lower.w|(upper.w<<16) );\n"
"}\n"
"\n"
"uint extractLower( uint data )\n"
"{\n"
" return data&0xffff;\n"
"}\n"
"\n"
"uint extractUpper( uint data )\n"
"{\n"
" return (data>>16)&0xffff;\n"
"}\n"
"\n"
"uint4 extractLower( uint4 data )\n"
"{\n"
" return uint4( data.x&0xffff, data.y&0xffff, data.z&0xffff, data.w&0xffff );\n"
"}\n"
"\n"
"uint4 extractUpper( uint4 data )\n"
"{\n"
" return uint4( (data.x>>16)&0xffff, (data.y>>16)&0xffff, (data.z>>16)&0xffff, (data.w>>16)&0xffff );\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void SortAndScatterKernel( DEFAULT_ARGS ) \n"
"{\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
" if( lIdx < (NUM_BUCKET) )\n"
" {\n"
" localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" for(uint igroup=wgIdx*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx+1)*m_nBlocksPerGroup); igroup++)\n"
" {\n"
" u32 myHistogram;\n"
" if( lIdx < (NUM_BUCKET) )\n"
" {\n"
" localPrefixSum[lIdx] = 0.f;\n"
" }\n"
"\n"
" u32 newOffset[4];\n"
" KeyValuePair myData[4];\n"
" { // read data\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
" uint startAddress = igroup*numLocalElements + lIdx*4;\n"
"\n"
" myData[0] = dataToSort[startAddress+0];\n"
" myData[1] = dataToSort[startAddress+1];\n"
" myData[2] = dataToSort[startAddress+2];\n"
" myData[3] = dataToSort[startAddress+3];\n"
"\n"
" newOffset[0] = newOffset[1] = newOffset[2] = newOffset[3] = 0;\n"
" }\n"
"\n"
" int localOffset = 0;\n"
" uint4 b = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);\n"
" for(uint targetKey=0; targetKey<(NUM_BUCKET); targetKey+=4)\n"
" {\n"
" uint4 key[4];\n"
" uint keySet[2];\n"
" { // pack 4\n"
" uint4 scannedKey[4];\n"
" key[0] = scannedKey[0] = extractKeys( b, targetKey+0 );\n"
" key[1] = scannedKey[1] = extractKeys( b, targetKey+1 );\n"
" key[2] = scannedKey[2] = extractKeys( b, targetKey+2 );\n"
" key[3] = scannedKey[3] = extractKeys( b, targetKey+3 );\n"
" {\n"
" uint s[4];\n"
" s[0] = prefixScanVectorEx( scannedKey[0] );\n"
" s[1] = prefixScanVectorEx( scannedKey[1] );\n"
" s[2] = prefixScanVectorEx( scannedKey[2] );\n"
" s[3] = prefixScanVectorEx( scannedKey[3] );\n"
" keySet[0] = packKeys( s[0], s[1] );\n"
" keySet[1] = packKeys( s[2], s[3] );\n"
" }\n"
" }\n"
"\n"
" uint dstAddressBase[4];\n"
" {\n"
"\n"
" uint totalSumPacked[2];\n"
" uint dstAddressPacked[2];\n"
"\n"
" localPrefixScan128Dual( keySet[0], keySet[1], lIdx, dstAddressPacked[0], dstAddressPacked[1], totalSumPacked[0], totalSumPacked[1] );\n"
"\n"
" dstAddressBase[0] = extractLower( dstAddressPacked[0] );\n"
" dstAddressBase[1] = extractUpper( dstAddressPacked[0] );\n"
" dstAddressBase[2] = extractLower( dstAddressPacked[1] );\n"
" dstAddressBase[3] = extractUpper( dstAddressPacked[1] );\n"
"\n"
" uint4 histogram;\n"
" histogram.x = extractLower(totalSumPacked[0]);\n"
" histogram.y = extractUpper(totalSumPacked[0]);\n"
" histogram.z = extractLower(totalSumPacked[1]);\n"
" histogram.w = extractUpper(totalSumPacked[1]);\n"
"\n"
" if( lIdx == targetKey + 0 ) myHistogram = histogram.x;\n"
" else if( lIdx == targetKey + 1 ) myHistogram = histogram.y;\n"
" else if( lIdx == targetKey + 2 ) myHistogram = histogram.z;\n"
" else if( lIdx == targetKey + 3 ) myHistogram = histogram.w;\n"
" \n"
" uint histogramSum = prefixScanVectorEx( histogram );\n"
"\n"
" if( lIdx == targetKey + 0 ) localPrefixSum[targetKey+0] = localOffset+histogram.x;\n"
" else if( lIdx == targetKey + 1 ) localPrefixSum[targetKey+1] = localOffset+histogram.y;\n"
" else if( lIdx == targetKey + 2 ) localPrefixSum[targetKey+2] = localOffset+histogram.z;\n"
" else if( lIdx == targetKey + 3 ) localPrefixSum[targetKey+3] = localOffset+histogram.w;\n"
"\n"
" localOffset += histogramSum;\n"
" }\n"
" \n"
" GROUP_LDS_BARRIER;\n"
"\n"
"\n"
" for(int ie=0; ie<4; ie++)\n"
" {\n"
" uint4 scannedKey = key[ie];\n"
" prefixScanVectorEx( scannedKey );\n"
"\n"
" uint offset = localPrefixSum[targetKey + ie] + dstAddressBase[ie];\n"
" uint4 dstAddress = uint4( offset, offset, offset, offset ) + scannedKey;\n"
"\n"
" newOffset[0] += dstAddress.x*key[ie].x;\n"
" newOffset[1] += dstAddress.y*key[ie].y;\n"
" newOffset[2] += dstAddress.z*key[ie].z;\n"
" newOffset[3] += dstAddress.w*key[ie].w;\n"
" }\n"
" }\n"
"\n"
" { // local scatter\n"
" SET_LOCAL_SORT_DATA(newOffset[0], myData[0]);\n"
" SET_LOCAL_SORT_DATA(newOffset[1], myData[1]);\n"
" SET_LOCAL_SORT_DATA(newOffset[2], myData[2]);\n"
" SET_LOCAL_SORT_DATA(newOffset[3], myData[3]);\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // write data\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
" {\n"
" int dataIdx = 4*lIdx+i;\n"
" KeyValuePair localData; GET_LOCAL_SORT_DATA( dataIdx, localData );\n"
" int binIdx = (localData.key >> m_startBit) & 0xf;\n"
" int groupOffset = localHistogramToCarry[binIdx];\n"
" int myIdx = dataIdx - localPrefixSum[binIdx];\n"
"\n"
" dataToSortOut[ groupOffset + myIdx ] = localData;\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogramToCarry[lIdx] += myHistogram;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
"}\n"
"\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void SortAndScatterKernel1( DEFAULT_ARGS )\n"
"{\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
" if( lIdx < (NUM_BUCKET) )\n"
" {\n"
" localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx.x];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
" {\n"
" u32 myHistogram;\n"
"\n"
" KeyValuePair myData[4];\n"
" uint startAddrBlock;\n"
" { // read data\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
" startAddrBlock = lIdx*4;\n"
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
" myData[0] = dataToSort[startAddress+0];\n"
" myData[1] = dataToSort[startAddress+1];\n"
" myData[2] = dataToSort[startAddress+2];\n"
" myData[3] = dataToSort[startAddress+3];\n"
" }\n"
"\n"
" // local sort\n"
" for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)\n"
" {\n"
" uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);\n"
" uint total;\n"
" uint4 rankOfP = localPrefixSum128V( keys, lIdx, total );\n"
" uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );\n"
"\n"
" uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;\n"
" \n"
" GROUP_LDS_BARRIER;\n"
"\n"
" SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );\n"
" SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );\n"
" SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );\n"
" SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );\n"
" GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );\n"
" GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );\n"
" GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );\n"
" }\n"
"\n"
" {// create histogram -> prefix sum\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[lIdx] = 0;\n"
" localHistogram[NUM_BUCKET+lIdx] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);\n"
" \n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );\n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );\n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );\n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" \n"
" uint hIdx = NUM_BUCKET+lIdx;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" myHistogram = localHistogram[hIdx];\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
"\n"
" localHistogram[hIdx] += localHistogram[hIdx-1];\n"
" localHistogram[hIdx] += localHistogram[hIdx-2];\n"
" localHistogram[hIdx] += localHistogram[hIdx-4];\n"
" localHistogram[hIdx] += localHistogram[hIdx-8];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
"/*\n"
" {// write back\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
" startAddrBlock = lIdx*4;\n"
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
" {\n"
" dataToSortOut[ startAddress+ie ] = myData[ie];\n"
" }\n"
" }\n"
"*/\n"
" {\n"
" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
" {\n"
" int dataIdx = startAddrBlock+ie;\n"
" int binIdx = (myData[ie].key>>m_startBit)&0xf;\n"
" int groupOffset = localHistogramToCarry[binIdx];\n"
" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
" dataToSortOut[ groupOffset + myIdx ] = myData[ie];\n"
" }\n"
" }\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogramToCarry[lIdx] += myHistogram;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" }\n"
"}\n"
"\n"
"/*\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void SortAndScatterKernel1( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )\n"
"{\n"
" if( lIdx.x < (NUM_BUCKET) )\n"
" {\n"
" localHistogramToCarry[lIdx.x] = rHistogram[lIdx.x*m_nWorkGroupsToExecute + gIdx.x];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
" {\n"
" u32 myHistogram;\n"
"\n"
" KeyValuePair myData[4];\n"
" uint startAddrBlock;\n"
" { // read data\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
" startAddrBlock = lIdx.x*4;\n"
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
" myData[0] = dataToSort[startAddress+0];\n"
" myData[1] = dataToSort[startAddress+1];\n"
" myData[2] = dataToSort[startAddress+2];\n"
" myData[3] = dataToSort[startAddress+3];\n"
" }\n"
"\n"
" for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)\n"
" {\n"
" uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);\n"
" uint total;\n"
" uint4 rankOfP = localPrefixSum128V( keys, lIdx.x, total );\n"
" uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );\n"
"\n"
" uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;\n"
" \n"
" GROUP_LDS_BARRIER;\n"
"\n"
" SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );\n"
" SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );\n"
" SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );\n"
" SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );\n"
" GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );\n"
" GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );\n"
" GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );\n"
" }\n"
" \n"
" {// create histogram -> prefix sum\n"
" if( lIdx.x < NUM_BUCKET )\n"
" {\n"
" localHistogram[lIdx.x] = 0;\n"
" localHistogram[NUM_BUCKET+lIdx.x] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);\n"
" \n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );\n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );\n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );\n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" \n"
" uint hIdx = NUM_BUCKET+lIdx.x;\n"
" if( lIdx.x < NUM_BUCKET )\n"
" {\n"
" myHistogram = localHistogram[hIdx];\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
"\n"
" if( lIdx.x < NUM_BUCKET )\n"
" {\n"
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
"\n"
" localHistogram[hIdx] += localHistogram[hIdx-1];\n"
" localHistogram[hIdx] += localHistogram[hIdx-2];\n"
" localHistogram[hIdx] += localHistogram[hIdx-4];\n"
" localHistogram[hIdx] += localHistogram[hIdx-8];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" {// write back\n"
" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
" {\n"
" int dataIdx = startAddrBlock+ie;\n"
" int binIdx = (myData[ie].key>>m_startBit)&0xf;\n"
" int groupOffset = localHistogramToCarry[binIdx];\n"
" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
" \n"
" dataToSortOut[ groupOffset + myIdx ] = myData[ie];\n"
" }\n"
" }\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx.x < NUM_BUCKET )\n"
" {\n"
" localHistogramToCarry[lIdx.x] += myHistogram;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" }\n"
"}\n"
"*/\n"
"\n"
"StructuredBuffer<KeyValuePair> dataToSort1 : register( t0 );\n"
"RWStructuredBuffer<u32> wHistogram1 : register(u0);\n"
"\n"
"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx.x]\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void StreamCountKernel( DEFAULT_ARGS ) \n"
"{\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
" int myHistogram[NUM_BUCKET];\n"
"\n"
" for(int i=0; i<NUM_BUCKET; i++)\n"
" {\n"
" MY_HISTOGRAM(i) = 0;\n"
" }\n"
"\n"
" for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
" {\n"
" uint localKeys[4];\n"
" { // read data\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
"\n"
" uint4 localAddress = uint4(lIdx, lIdx, lIdx, lIdx)*4+uint4(0,1,2,3);\n"
" uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;\n"
"\n"
" KeyValuePair localData0 = dataToSort1[globalAddress.x];\n"
" KeyValuePair localData1 = dataToSort1[globalAddress.y];\n"
" KeyValuePair localData2 = dataToSort1[globalAddress.z];\n"
" KeyValuePair localData3 = dataToSort1[globalAddress.w];\n"
"\n"
" localKeys[0] = (localData0.key >> m_startBit) & 0xf;\n"
" localKeys[1] = (localData1.key >> m_startBit) & 0xf;\n"
" localKeys[2] = (localData2.key >> m_startBit) & 0xf;\n"
" localKeys[3] = (localData3.key >> m_startBit) & 0xf;\n"
" }\n"
"\n"
" MY_HISTOGRAM( localKeys[0] )++;\n"
" MY_HISTOGRAM( localKeys[1] )++;\n"
" MY_HISTOGRAM( localKeys[2] )++;\n"
" MY_HISTOGRAM( localKeys[3] )++;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // reduce to 1\n"
" if( lIdx < 64 )//WG_SIZE/2 )\n"
" {\n"
" for(int i=0; i<NUM_BUCKET/2; i++)\n"
" {\n"
" int idx = lIdx;\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
" }\n"
" }\n"
" else if( lIdx < 128 )\n"
" {\n"
" for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)\n"
" {\n"
" int idx = lIdx-64;\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
" }\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // write data\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" wHistogram1[ lIdx*m_nWorkGroupsToExecute + wgIdx.x ] = localHistogramMat[ lIdx*WG_SIZE+0 ];\n"
" }\n"
" }\n"
"}\n"
"\n"
"/*\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void StreamCountKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID ) \n"
"{\n"
" int myHistogram[NUM_BUCKET];\n"
"\n"
" for(int i=0; i<NUM_BUCKET; i++)\n"
" {\n"
" myHistogram[i] = 0;\n"
" }\n"
"\n"
" for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
" {\n"
" uint localKeys[4];\n"
" { // read data\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
"\n"
" uint4 localAddress = uint4(lIdx.x, lIdx.x, lIdx.x, lIdx.x)*4+uint4(0,1,2,3);\n"
" uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;\n"
"\n"
" KeyValuePair localData0 = dataToSort1[globalAddress.x];\n"
" KeyValuePair localData1 = dataToSort1[globalAddress.y];\n"
" KeyValuePair localData2 = dataToSort1[globalAddress.z];\n"
" KeyValuePair localData3 = dataToSort1[globalAddress.w];\n"
"\n"
" localKeys[0] = (localData0.key >> m_startBit) & 0xf;\n"
" localKeys[1] = (localData1.key >> m_startBit) & 0xf;\n"
" localKeys[2] = (localData2.key >> m_startBit) & 0xf;\n"
" localKeys[3] = (localData3.key >> m_startBit) & 0xf;\n"
" }\n"
"\n"
" myHistogram[ localKeys[0] ]++;\n"
" myHistogram[ localKeys[1] ]++;\n"
" myHistogram[ localKeys[2] ]++;\n"
" myHistogram[ localKeys[3] ]++;\n"
" }\n"
"\n"
" { // move to shared\n"
" for(int i=0; i<NUM_BUCKET; i++)\n"
" {\n"
" localHistogramMat[i*WG_SIZE+lIdx.x] = myHistogram[i];\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // reduce to 1\n"
" if( lIdx.x < 64 )//WG_SIZE/2 )\n"
" {\n"
" for(int i=0; i<NUM_BUCKET/2; i++)\n"
" {\n"
" int idx = lIdx.x;\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
" }\n"
" }\n"
" else if( lIdx.x < 128 )\n"
" {\n"
" for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)\n"
" {\n"
" int idx = lIdx.x-64;\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
" }\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // write data\n"
" if( lIdx.x < NUM_BUCKET )\n"
" {\n"
" wHistogram1[ lIdx.x*m_nWorkGroupsToExecute + gIdx.x ] = localHistogramMat[ lIdx.x*WG_SIZE+0 ];\n"
" }\n"
" }\n"
"}\n"
"*/\n"
"\n"
"/*\n"
"// for MAX_WG_SIZE 20\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void PrefixScanKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID ) \n"
"{\n"
" uint4 myData = uint4(0,0,0,0);\n"
" if( 4*lIdx.x+0 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
" myData.x = wHistogram1[4*lIdx.x+0];\n"
" if( 4*lIdx.x+1 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
" myData.y = wHistogram1[4*lIdx.x+1];\n"
" if( 4*lIdx.x+2 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
" myData.z = wHistogram1[4*lIdx.x+2];\n"
" if( 4*lIdx.x+3 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
" myData.w = wHistogram1[4*lIdx.x+3];\n"
"\n"
" uint totalSum;\n"
"\n"
" uint4 scanned = localPrefixSum128V( myData, lIdx.x, totalSum );\n"
"\n"
" wHistogram1[4*lIdx.x+0] = scanned.x;\n"
" wHistogram1[4*lIdx.x+1] = scanned.y;\n"
" wHistogram1[4*lIdx.x+2] = scanned.z;\n"
" wHistogram1[4*lIdx.x+3] = scanned.w;\n"
"}\n"
"*/\n"
"\n"
"// for MAX_WG_SIZE 80\n"
"// can hold up to WG_SIZE*12 (128*12 > 80*16 )\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void PrefixScanKernel( DEFAULT_ARGS )\n"
"{\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
" uint data[12] = {0,0,0,0,0,0,0,0,0,0,0,0};\n"
" for(int i=0; i<12; i++)\n"
" {\n"
" if( int(12*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
" data[i] = wHistogram1[12*lIdx+i];\n"
" }\n"
"\n"
" uint4 myData = uint4(0,0,0,0);\n"
" myData.x = data[0] + data[1];\n"
" myData.y = data[2] + data[3];\n"
" myData.z = data[4] + data[5];\n"
" myData.w = data[6] + data[7];\n"
"\n"
"\n"
" uint totalSum;\n"
" uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );\n"
"\n"
" data[11] = scanned.w + data[9] + data[10];\n"
" data[10] = scanned.w + data[9];\n"
" data[9] = scanned.w;\n"
" data[8] = scanned.z + data[6] + data[7];\n"
" data[7] = scanned.z + data[6];\n"
" data[6] = scanned.z;\n"
" data[5] = scanned.y + data[3] + data[4];\n"
" data[4] = scanned.y + data[3];\n"
" data[3] = scanned.y;\n"
" data[2] = scanned.x + data[0] + data[1];\n"
" data[1] = scanned.x + data[0];\n"
" data[0] = scanned.x;\n"
"\n"
" for(int i=0; i<12; i++)\n"
" {\n"
" wHistogram1[12*lIdx+i] = data[i];\n"
" }\n"
"}\n"
"/*\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void PrefixScanKernel( DEFAULT_ARGS )\n"
"{\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
" uint data[8] = {0,0,0,0,0,0,0,0};\n"
" for(int i=0; i<8; i++)\n"
" {\n"
" if( int(8*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
" data[i] = wHistogram1[8*lIdx+i];\n"
" }\n"
"\n"
" uint4 myData = uint4(0,0,0,0);\n"
" myData.x = data[0] + data[1];\n"
" myData.y = data[2] + data[3];\n"
" myData.z = data[4] + data[5];\n"
" myData.w = data[6] + data[7];\n"
"\n"
"\n"
" uint totalSum;\n"
" uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );\n"
"\n"
" data[7] = scanned.w + data[6];\n"
" data[6] = scanned.w;// + data[5];\n"
" data[5] = scanned.z + data[4];\n"
" data[4] = scanned.z;// + data[3];\n"
" data[3] = scanned.y + data[2];\n"
" data[2] = scanned.y;// + data[1];\n"
" data[1] = scanned.x + data[0];\n"
" data[0] = scanned.x;\n"
"\n"
" for(int i=0; i<8; i++)\n"
" {\n"
" wHistogram1[8*lIdx+i] = data[i];\n"
" }\n"
"}\n"
"*/\n"
"\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void CopyKernel( DEFAULT_ARGS )\n"
"{\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
" for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
" {\n"
" KeyValuePair myData[4];\n"
" uint startAddrBlock;\n"
" { // read data\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
" startAddrBlock = lIdx*4;\n"
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
" myData[0] = dataToSort[startAddress+0];\n"
" myData[1] = dataToSort[startAddress+1];\n"
" myData[2] = dataToSort[startAddress+2];\n"
" myData[3] = dataToSort[startAddress+3];\n"
" }\n"
"\n"
" {\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
" dataToSortOut[startAddress+0] = myData[0];\n"
" dataToSortOut[startAddress+1] = myData[1];\n"
" dataToSortOut[startAddress+2] = myData[2];\n"
" dataToSortOut[startAddress+3] = myData[3];\n"
" }\n"
" }\n"
"}\n"
;

View File

@@ -0,0 +1,93 @@
/*
2011 Takahiro Harada
*/
//	Host (CPU) reference implementation of the radix sort: a classic stable
//	LSB radix sort over 32bit keys, 8 bits per pass, ping-ponging between the
//	caller's buffer and one scratch buffer of the same capacity.
template<>
class RadixSort<TYPE_HOST> : public RadixSortBase
{
	public:
		struct Data
		{
			//	scratch buffer used as the scatter target of each pass
			HostBuffer<SortData>* m_workBuffer;
		};

		enum
		{
			BITS_PER_PASS = 8,
			NUM_TABLES = (1<<BITS_PER_PASS),
		};

		//	Allocates the scratch buffer; maxSize bounds the largest n passed to execute().
		static
		Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_STANDARD)
		{
			ADLASSERT( deviceData->m_type == TYPE_HOST );
			Data* data = new Data;
			data->m_workBuffer = new HostBuffer<SortData>( deviceData, maxSize );
			return data;
		}

		//	Releases everything created by allocate().
		static
		void deallocate(Data* data)
		{
			delete data->m_workBuffer;
			delete data;
		}

		//	Sorts inout[0..n) ascending by m_key, considering only the low
		//	sortBits bits of the key.
		static
		void execute(Data* data, Buffer<SortData>& inout, int n, int sortBits = 32)
		{
			ADLASSERT( inout.getType() == TYPE_HOST );

			SortData* src = inout.m_ptr;
			SortData* dst = data->m_workBuffer->m_ptr;

			int numPasses = 0;
			for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS, numPasses++)
			{
				//	histogram of the current 8bit digit
				int offsetTable[NUM_TABLES] = {0};
				for(int i=0; i<n; i++)
				{
					offsetTable[ (src[i].m_key >> startBit) & (NUM_TABLES-1) ]++;
				}

				//	exclusive prefix sum -> start offset of each bucket
				int running = 0;
				for(int i=0; i<NUM_TABLES; i++)
				{
					int count = offsetTable[i];
					offsetTable[i] = running;
					running += count;
				}

				//	stable scatter into the other buffer, bumping each
				//	bucket's cursor as it fills
				for(int i=0; i<n; i++)
				{
					int digit = (src[i].m_key >> startBit) & (NUM_TABLES-1);
					dst[ offsetTable[digit]++ ] = src[i];
				}

				swap2( src, dst );
			}

			//	An odd number of passes leaves the sorted data in the scratch
			//	buffer (src after the final swap); copy it back into inout.
			if( numPasses & 1 )
			{
				memcpy( dst, src, sizeof(SortData)*n );
			}
		}
};

View File

@@ -0,0 +1,134 @@
//	OpenCL source for the "simple" radix sort, embedded as a C string so the
//	kernels can be built at runtime without loading a file. Each pass handles
//	8 bits of the key (shift by m_startBit, mask with 0xff):
//	LocalCountKernel emits per-group digit histograms in [digit][group]
//	layout; ScatterKernel redistributes elements using the scanned histograms.
static const char* radixSortSimpleKernelsCL = \
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"\n"
"#define WG_SIZE 128\n"
"#define NUM_PER_WI 4\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_key;\n"
" u32 m_value;\n"
"}SortData;\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_startBit;\n"
" u32 m_numGroups;\n"
" u32 m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void LocalCountKernel(__global SortData* sortData,\n"
" __global u32* ldsHistogramOut,\n"
" ConstBuffer cb)\n"
"{\n"
" __local u32 ldsHistogram[16][256];\n"
"\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" for(int i=0; i<16; i++)\n"
" {\n"
" ldsHistogram[i][lIdx] = 0.f;\n"
" ldsHistogram[i][lIdx+128] = 0.f;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" SortData datas[NUM_PER_WI];\n"
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
" datas[0].m_key = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
" datas[1].m_key = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
" datas[2].m_key = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
" datas[3].m_key = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
"\n"
" int tableIdx = lIdx%16;\n"
"\n"
" AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" u32 sum0, sum1;\n"
" sum0 = sum1 = 0;\n"
" for(int i=0; i<16; i++)\n"
" {\n"
" sum0 += ldsHistogram[i][lIdx];\n"
" sum1 += ldsHistogram[i][lIdx+128];\n"
" }\n"
"\n"
" ldsHistogramOut[lIdx*cb.m_numGroups+GET_GROUP_IDX] = sum0;\n"
" ldsHistogramOut[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX] = sum1;\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void ScatterKernel(__global SortData* sortData,\n"
" __global SortData* sortDataOut,\n"
" __global u32* scannedHistogram,\n"
" ConstBuffer cb)\n"
"{\n"
" __local u32 ldsCurrentLocation[256];\n"
"\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" {\n"
" ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*cb.m_numGroups+GET_GROUP_IDX];\n"
" ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" SortData datas[NUM_PER_WI];\n"
" int keys[NUM_PER_WI];\n"
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
" keys[0] = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
" keys[1] = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
" keys[2] = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
" keys[3] = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
"\n"
" int dst[NUM_PER_WI];\n"
" for(int i=0; i<WG_SIZE; i++)\n"
" {\n"
" if( i==lIdx )\n"
" {\n"
" AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);\n"
" AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);\n"
" AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);\n"
" AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" sortDataOut[dst[0]] = datas[0];\n"
" sortDataOut[dst[1]] = datas[1];\n"
" sortDataOut[dst[2]] = datas[2];\n"
" sortDataOut[dst[3]] = datas[3];\n"
"}\n"
"\n"
"";

View File

@@ -0,0 +1,131 @@
//	HLSL (DX11 compute) source for the simple radix sort, embedded as a C
//	string so the shaders can be built at runtime without loading a file.
//	Same algorithm as the OpenCL string: 8 bits per pass, LocalCountKernel
//	emits [digit][group] histograms, ScatterKernel redistributes elements.
static const char* radixSortSimpleKernelsDX11 = \
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128\n"
"#define NUM_PER_WI 4\n"
"\n"
"#define GET_GROUP_SIZE WG_SIZE\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_key;\n"
" u32 m_value;\n"
"}SortData;\n"
"\n"
"cbuffer SortCB : register( b0 )\n"
"{\n"
" u32 m_startBit;\n"
" u32 m_numGroups;\n"
" u32 m_padding[2];\n"
"};\n"
"\n"
"StructuredBuffer<SortData> sortData : register( t0 );\n"
"RWStructuredBuffer<u32> ldsHistogramOut : register( u0 );\n"
"\n"
"groupshared u32 ldsHistogram[16][256];\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void LocalCountKernel( DEFAULT_ARGS )\n"
"{\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" for(int i=0; i<16; i++)\n"
" {\n"
" ldsHistogram[i][lIdx] = 0.f;\n"
" ldsHistogram[i][lIdx+128] = 0.f;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" SortData datas[NUM_PER_WI];\n"
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
" datas[0].m_key = (datas[0].m_key >> m_startBit) & 0xff;\n"
" datas[1].m_key = (datas[1].m_key >> m_startBit) & 0xff;\n"
" datas[2].m_key = (datas[2].m_key >> m_startBit) & 0xff;\n"
" datas[3].m_key = (datas[3].m_key >> m_startBit) & 0xff;\n"
"\n"
" int tableIdx = lIdx%16;\n"
"\n"
" AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" u32 sum0, sum1;\n"
" sum0 = sum1 = 0;\n"
" for(int i=0; i<16; i++)\n"
" {\n"
" sum0 += ldsHistogram[i][lIdx];\n"
" sum1 += ldsHistogram[i][lIdx+128];\n"
" }\n"
"\n"
" ldsHistogramOut[lIdx*m_numGroups+GET_GROUP_IDX] = sum0;\n"
" ldsHistogramOut[(lIdx+128)*m_numGroups+GET_GROUP_IDX] = sum1;\n"
"}\n"
"\n"
"\n"
"RWStructuredBuffer<SortData> sortDataOut : register( u0 );\n"
"RWStructuredBuffer<u32> scannedHistogram : register( u1 );\n"
"\n"
"groupshared u32 ldsCurrentLocation[256];\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void ScatterKernel( DEFAULT_ARGS )\n"
"{\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" {\n"
" ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*m_numGroups+GET_GROUP_IDX];\n"
" ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*m_numGroups+GET_GROUP_IDX];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" SortData datas[NUM_PER_WI];\n"
" int keys[NUM_PER_WI];\n"
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
" keys[0] = (datas[0].m_key >> m_startBit) & 0xff;\n"
" keys[1] = (datas[1].m_key >> m_startBit) & 0xff;\n"
" keys[2] = (datas[2].m_key >> m_startBit) & 0xff;\n"
" keys[3] = (datas[3].m_key >> m_startBit) & 0xff;\n"
"\n"
" int dst[NUM_PER_WI];\n"
" for(int i=0; i<WG_SIZE; i++)\n"
"// for(int i=0; i<m_padding[0]; i++) // to reduce compile time\n"
" {\n"
" if( i==lIdx )\n"
" {\n"
" AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);\n"
" AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);\n"
" AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);\n"
" AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" sortDataOut[dst[0]] = datas[0];\n"
" sortDataOut[dst[1]] = datas[1];\n"
" sortDataOut[dst[2]] = datas[2];\n"
" sortDataOut[dst[3]] = datas[3];\n"
"}\n"
"";

View File

@@ -0,0 +1,147 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Author Takahiro Harada
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define WG_SIZE 128
#define NUM_PER_WI 4
typedef struct
{
u32 m_key;
u32 m_value;
}SortData;
typedef struct
{
u32 m_startBit;
u32 m_numGroups;
u32 m_padding[2];
} ConstBuffer;
//	LocalCountKernel: per-work-group histogram of the current 8bit digit.
//	Each work item reads NUM_PER_WI consecutive elements. The histogram is
//	replicated 16x in LDS (work item i updates copy i%16) to reduce atomic
//	contention; the copies are then reduced and written out in
//	[digit][group] layout so a single prefix scan over ldsHistogramOut
//	yields global scatter offsets for ScatterKernel.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void LocalCountKernel(__global SortData* sortData,
	__global u32* ldsHistogramOut,
	ConstBuffer cb)
{
	__local u32 ldsHistogram[16][256];

	int lIdx = GET_LOCAL_IDX;
	int gIdx = GET_GLOBAL_IDX;

	//	Clear all 16 copies: 128 work items x 2 slots cover the 256 bins.
	//	(Was "= 0.f": a float literal stored into a u32 slot; the implicit
	//	conversion still produces 0, but the integer literal is the correct
	//	type and avoids compiler warnings.)
	for(int i=0; i<16; i++)
	{
		ldsHistogram[i][lIdx] = 0;
		ldsHistogram[i][lIdx+128] = 0;
	}

	GROUP_LDS_BARRIER;

	SortData datas[NUM_PER_WI];
	datas[0] = sortData[gIdx*NUM_PER_WI+0];
	datas[1] = sortData[gIdx*NUM_PER_WI+1];
	datas[2] = sortData[gIdx*NUM_PER_WI+2];
	datas[3] = sortData[gIdx*NUM_PER_WI+3];

	//	extract the 8bit digit being counted in this pass
	datas[0].m_key = (datas[0].m_key >> cb.m_startBit) & 0xff;
	datas[1].m_key = (datas[1].m_key >> cb.m_startBit) & 0xff;
	datas[2].m_key = (datas[2].m_key >> cb.m_startBit) & 0xff;
	datas[3].m_key = (datas[3].m_key >> cb.m_startBit) & 0xff;

	int tableIdx = lIdx%16;

	AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);
	AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);
	AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);
	AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);

	GROUP_LDS_BARRIER;

	//	reduce the 16 copies; each work item owns bins lIdx and lIdx+128
	u32 sum0, sum1;
	sum0 = sum1 = 0;
	for(int i=0; i<16; i++)
	{
		sum0 += ldsHistogram[i][lIdx];
		sum1 += ldsHistogram[i][lIdx+128];
	}

	ldsHistogramOut[lIdx*cb.m_numGroups+GET_GROUP_IDX] = sum0;
	ldsHistogramOut[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX] = sum1;
}
//	ScatterKernel: writes each element to its globally sorted position for
//	the current 8bit digit. ldsCurrentLocation is seeded with this group's
//	column of the scanned histogram ([digit][group] layout produced by
//	LocalCountKernel) and each element claims its destination with an atomic
//	increment of its digit's cursor.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void ScatterKernel(__global SortData* sortData,
__global SortData* sortDataOut,
__global u32* scannedHistogram,
ConstBuffer cb)
{
__local u32 ldsCurrentLocation[256];
int lIdx = GET_LOCAL_IDX;
int gIdx = GET_GLOBAL_IDX;
//	load the 256 scatter cursors for this group (2 per work item)
{
ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*cb.m_numGroups+GET_GROUP_IDX];
ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX];
}
GROUP_LDS_BARRIER;
SortData datas[NUM_PER_WI];
int keys[NUM_PER_WI];
datas[0] = sortData[gIdx*NUM_PER_WI+0];
datas[1] = sortData[gIdx*NUM_PER_WI+1];
datas[2] = sortData[gIdx*NUM_PER_WI+2];
datas[3] = sortData[gIdx*NUM_PER_WI+3];
//	digit of each element for this pass
keys[0] = (datas[0].m_key >> cb.m_startBit) & 0xff;
keys[1] = (datas[1].m_key >> cb.m_startBit) & 0xff;
keys[2] = (datas[2].m_key >> cb.m_startBit) & 0xff;
keys[3] = (datas[3].m_key >> cb.m_startBit) & 0xff;
int dst[NUM_PER_WI];
//	Work items take turns (only i==lIdx runs the body), so destinations are
//	claimed in work-item order, keeping the scatter stable within the group;
//	the barrier publishes each iteration's cursor updates before the next
//	work item proceeds.
for(int i=0; i<WG_SIZE; i++)
{
if( i==lIdx )
{
AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);
AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);
AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);
AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);
}
GROUP_LDS_BARRIER;
}
sortDataOut[dst[0]] = datas[0];
sortDataOut[dst[1]] = datas[1];
sortDataOut[dst[2]] = datas[2];
sortDataOut[dst[3]] = datas[3];
}

View File

@@ -0,0 +1,133 @@
/*
2011 Takahiro Harada
*/
typedef uint u32;
#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
#define AtomInc(x) InterlockedAdd(x, 1)
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
// takahiro end
#define WG_SIZE 128
#define NUM_PER_WI 4
#define GET_GROUP_SIZE WG_SIZE
typedef struct
{
u32 m_key;
u32 m_value;
}SortData;
cbuffer SortCB : register( b0 )
{
u32 m_startBit;
u32 m_numGroups;
u32 m_padding[2];
};
StructuredBuffer<SortData> sortData : register( t0 );
RWStructuredBuffer<u32> ldsHistogramOut : register( u0 );
groupshared u32 ldsHistogram[16][256];
//	LocalCountKernel (HLSL): per-work-group histogram of the current 8bit
//	digit. Each thread reads NUM_PER_WI consecutive elements; the histogram
//	is replicated 16x in groupshared memory (thread i updates copy i%16) to
//	reduce atomic contention, then reduced and written out in [digit][group]
//	layout so a prefix scan over ldsHistogramOut yields scatter offsets.
[numthreads(WG_SIZE, 1, 1)]
void LocalCountKernel( DEFAULT_ARGS )
{
	int lIdx = GET_LOCAL_IDX;
	int gIdx = GET_GLOBAL_IDX;

	//	Clear all 16 copies: 128 threads x 2 slots cover the 256 bins.
	//	(Was "= 0.f": a float literal stored into a u32 slot; the implicit
	//	conversion still produces 0, but the integer literal is the correct
	//	type.)
	for(int i=0; i<16; i++)
	{
		ldsHistogram[i][lIdx] = 0;
		ldsHistogram[i][lIdx+128] = 0;
	}

	GROUP_LDS_BARRIER;

	SortData datas[NUM_PER_WI];
	datas[0] = sortData[gIdx*NUM_PER_WI+0];
	datas[1] = sortData[gIdx*NUM_PER_WI+1];
	datas[2] = sortData[gIdx*NUM_PER_WI+2];
	datas[3] = sortData[gIdx*NUM_PER_WI+3];

	//	extract the 8bit digit being counted in this pass
	datas[0].m_key = (datas[0].m_key >> m_startBit) & 0xff;
	datas[1].m_key = (datas[1].m_key >> m_startBit) & 0xff;
	datas[2].m_key = (datas[2].m_key >> m_startBit) & 0xff;
	datas[3].m_key = (datas[3].m_key >> m_startBit) & 0xff;

	int tableIdx = lIdx%16;

	AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);
	AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);
	AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);
	AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);

	GROUP_LDS_BARRIER;

	//	reduce the 16 copies; each thread owns bins lIdx and lIdx+128
	u32 sum0, sum1;
	sum0 = sum1 = 0;
	for(int i=0; i<16; i++)
	{
		sum0 += ldsHistogram[i][lIdx];
		sum1 += ldsHistogram[i][lIdx+128];
	}

	ldsHistogramOut[lIdx*m_numGroups+GET_GROUP_IDX] = sum0;
	ldsHistogramOut[(lIdx+128)*m_numGroups+GET_GROUP_IDX] = sum1;
}
RWStructuredBuffer<SortData> sortDataOut : register( u0 );
RWStructuredBuffer<u32> scannedHistogram : register( u1 );
groupshared u32 ldsCurrentLocation[256];
//	ScatterKernel (HLSL): writes each element to its globally sorted position
//	for the current 8bit digit. ldsCurrentLocation is seeded with this
//	group's column of the scanned histogram and each element claims its
//	destination with an atomic increment of its digit's cursor.
[numthreads(WG_SIZE, 1, 1)]
void ScatterKernel( DEFAULT_ARGS )
{
int lIdx = GET_LOCAL_IDX;
int gIdx = GET_GLOBAL_IDX;
//	load the 256 scatter cursors for this group (2 per thread)
{
ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*m_numGroups+GET_GROUP_IDX];
ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*m_numGroups+GET_GROUP_IDX];
}
GROUP_LDS_BARRIER;
SortData datas[NUM_PER_WI];
int keys[NUM_PER_WI];
datas[0] = sortData[gIdx*NUM_PER_WI+0];
datas[1] = sortData[gIdx*NUM_PER_WI+1];
datas[2] = sortData[gIdx*NUM_PER_WI+2];
datas[3] = sortData[gIdx*NUM_PER_WI+3];
//	digit of each element for this pass
keys[0] = (datas[0].m_key >> m_startBit) & 0xff;
keys[1] = (datas[1].m_key >> m_startBit) & 0xff;
keys[2] = (datas[2].m_key >> m_startBit) & 0xff;
keys[3] = (datas[3].m_key >> m_startBit) & 0xff;
int dst[NUM_PER_WI];
//	Threads take turns (only i==lIdx runs the body), so destinations are
//	claimed in thread order, keeping the scatter stable within the group;
//	the barrier publishes each iteration's cursor updates before the next
//	thread proceeds.
for(int i=0; i<WG_SIZE; i++)
// for(int i=0; i<m_padding[0]; i++) // to reduce compile time
{
if( i==lIdx )
{
AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);
AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);
AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);
AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);
}
GROUP_LDS_BARRIER;
}
sortDataOut[dst[0]] = datas[0];
sortDataOut[dst[1]] = datas[1];
sortDataOut[dst[2]] = datas[2];
sortDataOut[dst[3]] = datas[3];
}

View File

@@ -0,0 +1,149 @@
//	OpenCL source for the simple radix sort (variant carrying the Bullet
//	license header inside the string), embedded as a C string so the kernels
//	can be built at runtime without loading a file. 8 bits per pass:
//	LocalCountKernel emits [digit][group] histograms, ScatterKernel
//	redistributes elements using the scanned histograms.
static const char* radixSortSimpleKernelsCL= \
"/*\n"
"Bullet Continuous Collision Detection and Physics Library\n"
"Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Author Takahiro Harada\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"\n"
"#define WG_SIZE 128\n"
"#define NUM_PER_WI 4\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_key; \n"
" u32 m_value;\n"
"}SortData;\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_startBit;\n"
" u32 m_numGroups;\n"
" u32 m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void LocalCountKernel(__global SortData* sortData, \n"
" __global u32* ldsHistogramOut,\n"
" ConstBuffer cb)\n"
"{\n"
" __local u32 ldsHistogram[16][256];\n"
"\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" \n"
" for(int i=0; i<16; i++)\n"
" {\n"
" ldsHistogram[i][lIdx] = 0.f;\n"
" ldsHistogram[i][lIdx+128] = 0.f;\n"
" }\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" \n"
" SortData datas[NUM_PER_WI];\n"
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
" datas[0].m_key = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
" datas[1].m_key = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
" datas[2].m_key = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
" datas[3].m_key = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
"\n"
" int tableIdx = lIdx%16;\n"
" \n"
" AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" u32 sum0, sum1;\n"
" sum0 = sum1 = 0;\n"
" for(int i=0; i<16; i++)\n"
" {\n"
" sum0 += ldsHistogram[i][lIdx];\n"
" sum1 += ldsHistogram[i][lIdx+128];\n"
" }\n"
"\n"
" ldsHistogramOut[lIdx*cb.m_numGroups+GET_GROUP_IDX] = sum0;\n"
" ldsHistogramOut[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX] = sum1;\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void ScatterKernel(__global SortData* sortData,\n"
" __global SortData* sortDataOut,\n"
" __global u32* scannedHistogram, \n"
" ConstBuffer cb)\n"
"{\n"
" __local u32 ldsCurrentLocation[256];\n"
"\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" \n"
" {\n"
" ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*cb.m_numGroups+GET_GROUP_IDX];\n"
" ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" SortData datas[NUM_PER_WI];\n"
" int keys[NUM_PER_WI];\n"
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
" keys[0] = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
" keys[1] = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
" keys[2] = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
" keys[3] = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
"\n"
" int dst[NUM_PER_WI];\n"
" for(int i=0; i<WG_SIZE; i++)\n"
" {\n"
" if( i==lIdx )\n"
" {\n"
" AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);\n"
" AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);\n"
" AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);\n"
" AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" sortDataOut[dst[0]] = datas[0];\n"
" sortDataOut[dst[1]] = datas[1];\n"
" sortDataOut[dst[2]] = datas[2];\n"
" sortDataOut[dst[3]] = datas[3];\n"
"}\n"
;

View File

@@ -0,0 +1,135 @@
//	HLSL (DX11 compute) source for the simple radix sort, embedded as a C
//	string so the shaders can be built at runtime without loading a file.
//	Same algorithm as the OpenCL variant: 8 bits per pass, LocalCountKernel
//	emits [digit][group] histograms, ScatterKernel redistributes elements.
static const char* radixSortSimpleKernelsDX11= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128\n"
"#define NUM_PER_WI 4\n"
"\n"
"#define GET_GROUP_SIZE WG_SIZE\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_key; \n"
" u32 m_value;\n"
"}SortData;\n"
"\n"
"cbuffer SortCB : register( b0 )\n"
"{\n"
" u32 m_startBit;\n"
" u32 m_numGroups;\n"
" u32 m_padding[2];\n"
"};\n"
" \n"
"StructuredBuffer<SortData> sortData : register( t0 );\n"
"RWStructuredBuffer<u32> ldsHistogramOut : register( u0 );\n"
"\n"
"groupshared u32 ldsHistogram[16][256];\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void LocalCountKernel( DEFAULT_ARGS )\n"
"{\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" \n"
" for(int i=0; i<16; i++)\n"
" {\n"
" ldsHistogram[i][lIdx] = 0.f;\n"
" ldsHistogram[i][lIdx+128] = 0.f;\n"
" }\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" \n"
" SortData datas[NUM_PER_WI];\n"
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
" datas[0].m_key = (datas[0].m_key >> m_startBit) & 0xff;\n"
" datas[1].m_key = (datas[1].m_key >> m_startBit) & 0xff;\n"
" datas[2].m_key = (datas[2].m_key >> m_startBit) & 0xff;\n"
" datas[3].m_key = (datas[3].m_key >> m_startBit) & 0xff;\n"
"\n"
" int tableIdx = lIdx%16;\n"
" \n"
" AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" u32 sum0, sum1;\n"
" sum0 = sum1 = 0;\n"
" for(int i=0; i<16; i++)\n"
" {\n"
" sum0 += ldsHistogram[i][lIdx];\n"
" sum1 += ldsHistogram[i][lIdx+128];\n"
" }\n"
"\n"
" ldsHistogramOut[lIdx*m_numGroups+GET_GROUP_IDX] = sum0;\n"
" ldsHistogramOut[(lIdx+128)*m_numGroups+GET_GROUP_IDX] = sum1;\n"
"}\n"
"\n"
"\n"
"RWStructuredBuffer<SortData> sortDataOut : register( u0 );\n"
"RWStructuredBuffer<u32> scannedHistogram : register( u1 );\n"
"\n"
"groupshared u32 ldsCurrentLocation[256];\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void ScatterKernel( DEFAULT_ARGS )\n"
"{\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" \n"
" {\n"
" ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*m_numGroups+GET_GROUP_IDX];\n"
" ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*m_numGroups+GET_GROUP_IDX];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" SortData datas[NUM_PER_WI];\n"
" int keys[NUM_PER_WI];\n"
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
" keys[0] = (datas[0].m_key >> m_startBit) & 0xff;\n"
" keys[1] = (datas[1].m_key >> m_startBit) & 0xff;\n"
" keys[2] = (datas[2].m_key >> m_startBit) & 0xff;\n"
" keys[3] = (datas[3].m_key >> m_startBit) & 0xff;\n"
"\n"
" int dst[NUM_PER_WI];\n"
" for(int i=0; i<WG_SIZE; i++)\n"
"// for(int i=0; i<m_padding[0]; i++) // to reduce compile time\n"
" {\n"
" if( i==lIdx )\n"
" {\n"
" AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);\n"
" AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);\n"
" AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);\n"
" AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" sortDataOut[dst[0]] = datas[0];\n"
" sortDataOut[dst[1]] = datas[1];\n"
" sortDataOut[dst[2]] = datas[2];\n"
" sortDataOut[dst[3]] = datas[3];\n"
"}\n"
;

View File

@@ -0,0 +1,177 @@
/*
2011 Takahiro Harada
*/
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Sort\\RadixSortStandardKernels"
#define KERNEL0 "LocalSortKernel"
#define KERNEL1 "ScatterKernel"
#define KERNEL2 "CopyKernel"
#include <AdlPrimitives/Sort/RadixSortStandardKernelsCL.h>
#include <AdlPrimitives/Sort/RadixSortStandardKernelsDX11.h>
//	"Standard" GPU radix sort: 32bit keys sorted in 4bit digits
//	(32/BITS_PER_PASS = 8 passes at most). Each pass launches
//	LocalSortKernel, a PrefixScan over the per-group histograms, and
//	ScatterKernel, ping-ponging between the caller's buffer and
//	m_workBuffer3 (see execute()).
template<DeviceType type>
class RadixSortStandard : public RadixSortBase
{
public:
typedef Launcher::BufferInfo BufferInfo;
enum
{
WG_SIZE = 128,
NUM_PER_WI = 4,
BITS_PER_PASS = 4,
};
//	Extends the generic sort Data with this implementation's kernels and
//	scratch buffers (all created/sized in allocate()).
struct Data : public RadixSort<type>::Data
{
Kernel* m_localSortKernel;
Kernel* m_scatterKernel;
Kernel* m_copyKernel;
Buffer<u32>* m_workBuffer0;	//	per-group digit histograms from LocalSortKernel (scan input)
Buffer<u32>* m_workBuffer1;	//	second LocalSortKernel output, consumed by ScatterKernel
Buffer<u32>* m_workBuffer2;	//	scanned histograms (global scatter offsets)
Buffer<SortData>* m_workBuffer3;	//	ping-pong buffer for scatter output
Buffer<int4>* m_constBuffer[32/BITS_PER_PASS];	//	one const buffer per pass
};
static
Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_NORMAL);
static
void deallocate(void* data);
static
void execute(void* data, Buffer<SortData>& inout, int n, int sortBits);
};
//	Creates kernels, scan data and scratch buffers sized for at most maxSize
//	elements; returns the Data handle passed to execute()/deallocate().
template<DeviceType type>
typename RadixSortStandard<type>::Data* RadixSortStandard<type>::allocate(const Device* deviceData, int maxSize, Option option)
{
ADLASSERT( type == deviceData->m_type );
//	each work group processes WG_SIZE*NUM_PER_WI elements per pass
u32 maxNumGroups = (maxSize+WG_SIZE*NUM_PER_WI-1)/(WG_SIZE*NUM_PER_WI);
//	kernel source: embedded strings when ADL_LOAD_KERNEL_FROM_STRING is
//	defined, otherwise null so getKernel() loads from PATH
const char* src[] =
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
{radixSortStandardKernelsCL,radixSortStandardKernelsDX11};
// ADLASSERT(0);
#else
{0,0};
#endif
Data* data = new Data;
data->m_option = option;
data->m_deviceData = deviceData;
data->m_localSortKernel = deviceData->getKernel( PATH, KERNEL0, 0, src[type] );
data->m_scatterKernel = deviceData->getKernel( PATH, KERNEL1, 0, src[type] );
data->m_copyKernel = deviceData->getKernel( PATH, KERNEL2, 0, src[type] );
// is this correct? the scan input is one (1<<BITS_PER_PASS)-bin histogram per group
data->m_scanData = PrefixScan<type>::allocate( deviceData, maxNumGroups*(1<<BITS_PER_PASS) );
data->m_workBuffer0 = new Buffer<u32>( deviceData, maxNumGroups*(1<<BITS_PER_PASS) );
data->m_workBuffer1 = new Buffer<u32>( deviceData, maxNumGroups*(1<<BITS_PER_PASS) );
data->m_workBuffer2 = new Buffer<u32>( deviceData, maxNumGroups*(1<<BITS_PER_PASS) );
data->m_workBuffer3 = new Buffer<SortData>( deviceData, maxSize );
//	one const buffer per pass -- presumably so queued passes don't overwrite
//	each other's constants while in flight (TODO confirm)
for(int i=0; i<32/BITS_PER_PASS; i++)
data->m_constBuffer[i] = new Buffer<int4>( deviceData, 1, BufferBase::BUFFER_CONST );
data->m_maxSize = maxSize;
return data;
}
//	Releases everything created by allocate(). Kernels obtained through
//	getKernel() are not released here (presumably device-owned -- TODO confirm).
template<DeviceType type>
void RadixSortStandard<type>::deallocate(void* rawData)
{
	Data* d = static_cast<Data*>(rawData);

	PrefixScan<type>::deallocate( d->m_scanData );

	for(int i=0; i<32/BITS_PER_PASS; i++)
	{
		delete d->m_constBuffer[i];
	}

	delete d->m_workBuffer3;
	delete d->m_workBuffer2;
	delete d->m_workBuffer1;
	delete d->m_workBuffer0;

	delete d;
}
//	Sorts inout[0..n) by m_key, considering the low sortBits bits, in
//	BITS_PER_PASS-bit passes that ping-pong between the (mapped) inout
//	buffer and m_workBuffer3.
template<DeviceType type>
void RadixSortStandard<type>::execute(void* rawData, Buffer<SortData>& inout, int n, int sortBits)
{
Data* data = (Data*)rawData;
//	kernels assume whole blocks of WG_SIZE*NUM_PER_WI = 512 elements
ADLASSERT( n%512 == 0 );
ADLASSERT( n <= data->m_maxSize );
ADLASSERT( NUM_PER_WI == 4 );
//	get a device-side view of inout (may be a distinct buffer if inout
//	lives elsewhere; unmapped at the end)
Buffer<SortData>* src = BufferUtils::map<type, true>( data->m_deviceData, &inout );
Buffer<SortData>* dst = data->m_workBuffer3;
const Device* deviceData = data->m_deviceData;
int numGroups = (n+WG_SIZE*NUM_PER_WI-1)/(WG_SIZE*NUM_PER_WI);
int4 constBuffer;
int iPass = 0;
//	one pass per BITS_PER_PASS bits; constants are x=startBit, y=numGroups, z=WG_SIZE
for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS, iPass++)
{
constBuffer.x = startBit;
constBuffer.y = numGroups;
constBuffer.z = WG_SIZE;
{	//	local sort: writes per-group histograms (wB0) and a second output (wB1)
BufferInfo bInfo[] = { BufferInfo( src ), BufferInfo( data->m_workBuffer0 ), BufferInfo( data->m_workBuffer1 ) };
Launcher launcher( deviceData, data->m_localSortKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
launcher.launch1D( WG_SIZE*numGroups, WG_SIZE );
}
//	prefix scan of the histograms (wB0 -> wB2) to get global scatter offsets
PrefixScan<type>::execute( data->m_scanData, *data->m_workBuffer0, *data->m_workBuffer2, numGroups*(1<<BITS_PER_PASS) );
{	//	scatter elements to their globally sorted positions for this digit
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( data->m_workBuffer2, true ), BufferInfo( data->m_workBuffer1, true ),
BufferInfo( dst ) };
Launcher launcher( deviceData, data->m_scatterKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
launcher.launch1D( WG_SIZE*numGroups, WG_SIZE );
}
if(0)
{	//	disabled debug path: copy the pass result straight back into src
BufferInfo bInfo[] = { BufferInfo( dst, true ), BufferInfo( src ) };
Launcher launcher( deviceData, data->m_copyKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.launch1D( n, WG_SIZE );
}
swap2( src, dst );
}
//	After an odd number of passes the sorted data sits in m_workBuffer3.
//	NOTE(review): when map() returns a buffer distinct from &inout this
//	condition is also true after an even number of passes, and the copy then
//	writes src into the work buffer before unmap -- verify unmap() handles
//	both cases and writes the sorted buffer back to inout.
if( src != &inout )
{
BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( dst ) };
Launcher launcher( deviceData, data->m_copyKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.launch1D( n, WG_SIZE );
}
BufferUtils::unmap<true>( src, &inout );
}
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2

View File

@@ -0,0 +1,345 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Author Takahiro Harada
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
#define WG_SIZE 128
#define NUM_PER_WI 4
typedef struct
{
u32 m_key;
u32 m_value;
}SortData;
typedef struct
{
u32 m_startBit;
u32 m_numGroups;
u32 m_padding[2];
} ConstBuffer;
#define BITS_PER_PASS 4
// Inclusive prefix scan of the four lanes of a uint4:
// returns ( x, x+y, x+y+z, x+y+z+w ). Unsigned math, wraps mod 2^32.
uint4 prefixScanVector( uint4 data )
{
	uint4 scan;
	scan.x = data.x;
	scan.y = data.x + data.y;
	scan.z = scan.y + data.z;
	scan.w = scan.z + data.w;
	return scan;
}
// Exclusive prefix scan of the four lanes of *data, done in place:
// *data becomes ( 0, x, x+y, x+y+z ). Returns the lane total x+y+z+w.
// All arithmetic is unsigned and wraps mod 2^32, so computing the
// exclusive scan directly is equivalent to the inclusive-minus-input form.
uint prefixScanVectorEx( uint4* data )
{
	uint4 v = *data;
	uint4 ex;
	ex.x = 0;
	ex.y = v.x;
	ex.z = v.x + v.y;
	ex.w = ex.z + v.z;
	*data = ex;
	return ex.w + v.w;
}
// Work-group-wide exclusive prefix sum over 512 elements (WG_SIZE work items
// holding a uint4 each). Each work item first scans its own vector
// (prefixScanVectorEx leaves the lane-exclusive scan in pData and returns the
// lane total), then the 128 per-WI totals are scanned in LDS and each WI's
// base offset is added back to its 4 lanes.
// Returns the exclusive prefix sum for this WI's 4 lanes; *totalSum receives
// the sum over all 512 elements. sorterSharedMemory must hold >= 2*WG_SIZE
// entries: the low half [0,128) is zeroed as padding, the per-WI totals live
// in the high half [128,256).
uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32 sorterSharedMemory[] )
{
	{ // Set data
		// Zero the low half so the strided reads below that fall off the
		// bottom of the scan window read 0; totals go in the high half.
		sorterSharedMemory[lIdx] = 0;
		sorterSharedMemory[lIdx+WG_SIZE] = prefixScanVectorEx( &pData );
	}
	GROUP_LDS_BARRIER;
	{ // Prefix sum
		// 64 work items scan the 128 totals: each owns the odd slot 'idx'
		// (129,131,...,255) and accumulates strides 1..64, then fixes up the
		// even slot idx-1. Only mem_fences separate the steps, so
		// NOTE(review): this assumes the 64 active work items execute in
		// lock-step (a 64-wide wavefront) — confirm on non-AMD devices.
		int idx = 2*lIdx + (WG_SIZE+1);
		if( lIdx < 64 )
		{
			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
			GROUP_MEM_FENCE;
			// Propagate the pair's inclusive value into the even slot.
			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
			GROUP_MEM_FENCE;
		}
	}
	GROUP_LDS_BARRIER;
	// Last slot of the high half holds the grand total over all 512 elements.
	*totalSum = sorterSharedMemory[WG_SIZE*2-1];
	// lIdx+127 == (lIdx-1)+WG_SIZE: the inclusive scan of the previous WI's
	// total, i.e. this WI's exclusive base offset (0 for lIdx==0, from the
	// zeroed low half).
	uint addValue = sorterSharedMemory[lIdx+127];
	return pData + make_uint4(addValue, addValue, addValue, addValue);
}
// Accumulate a (1<<BITS_PER_PASS) = 16-bin LDS histogram of the low 4 bits of
// the four (already digit-shifted) keys each work item holds.
// 'histogram' must point to at least 16 __local entries; the first 16 work
// items zero the bins, the barrier makes the zeros visible, then every WI
// atomically bumps the bin of each of its 4 keys (requires
// cl_khr_local_int32_base_atomics, enabled at the top of this file).
// The caller must barrier again before reading the result.
// NOTE(review): wgIdx is never read here — kept only for signature symmetry.
void generateHistogram(u32 lIdx, u32 wgIdx,
	uint4 sortedData,
	__local u32 *histogram)
{
	if( lIdx < (1<<BITS_PER_PASS) )
	{
		histogram[lIdx] = 0;
	}
	int mask = ((1<<BITS_PER_PASS)-1);
	uint4 keys = make_uint4( (sortedData.x)&mask, (sortedData.y)&mask, (sortedData.z)&mask, (sortedData.w)&mask );
	GROUP_LDS_BARRIER;
	AtomInc( histogram[keys.x] );
	AtomInc( histogram[keys.y] );
	AtomInc( histogram[keys.z] );
	AtomInc( histogram[keys.w] );
}
//
//
//
// One 4-bit radix-sort pass, local to each work group.
// Every work group loads its own WG_SIZE*NUM_PER_WI (= 512) SortData
// elements, sorts them in place on the digit in bits
// [cb.m_startBit, cb.m_startBit+BITS_PER_PASS) of m_key via four stable
// 1-bit split steps, writes them back to sortDataIn, and emits the group's
// 16-bin digit histogram in two layouts:
//   ldsHistogramOut0 — radix-major: bin*cb.m_numGroups + group
//   ldsHistogramOut1 — group-major: group*16 + bin
// ScatterKernel later consumes both to place elements globally.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void LocalSortKernel(__global SortData* sortDataIn,
	__global u32* ldsHistogramOut0,
	__global u32* ldsHistogramOut1,
	ConstBuffer cb)
{
	// Scratch for the 512-entry split/scan, reused for the 16-bin histogram.
	__local u32 ldsSortData[ WG_SIZE*NUM_PER_WI + 16 ];
	int nElemsPerWG = WG_SIZE*NUM_PER_WI;
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;	// NOTE(review): unused below
	// The 4 consecutive local slots owned by this work item.
	uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
	SortData sortData[NUM_PER_WI];
	{ // load this group's slice, 4 elements per work item
		u32 offset = nElemsPerWG*wgIdx;
		sortData[0] = sortDataIn[offset+localAddr.x];
		sortData[1] = sortDataIn[offset+localAddr.y];
		sortData[2] = sortDataIn[offset+localAddr.z];
		sortData[3] = sortDataIn[offset+localAddr.w];
	}
	int bitIdx = cb.m_startBit;
	do
	{
		// Stable split on bit 'bitIdx': 0-bit elements first, order kept.
// what is this?
// if( lIdx == wgSize-1 ) ldsSortData[256] = sortData[3].m_key;
		u32 mask = (1<<bitIdx);
		uint4 cmpResult = make_uint4( sortData[0].m_key & mask, sortData[1].m_key & mask, sortData[2].m_key & mask, sortData[3].m_key & mask );
		// 1 where the bit is 0; after the scan this is each element's rank
		// among the 0-bit elements, and 'total' is their count.
		uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );
		u32 total;
		prefixSum = localPrefixSum128V( prefixSum, lIdx, &total, ldsSortData );
		{
			// 0-bit elements go to their rank; 1-bit elements go after all
			// 'total' 0-bit ones (localAddr - rank = rank among 1-bits).
			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );
			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );
			// Shuffle keys, then values, through the same LDS scratch —
			// hence a barrier between every write phase and read phase.
			GROUP_LDS_BARRIER;
			ldsSortData[dstAddr.x] = sortData[0].m_key;
			ldsSortData[dstAddr.y] = sortData[1].m_key;
			ldsSortData[dstAddr.z] = sortData[2].m_key;
			ldsSortData[dstAddr.w] = sortData[3].m_key;
			GROUP_LDS_BARRIER;
			sortData[0].m_key = ldsSortData[localAddr.x];
			sortData[1].m_key = ldsSortData[localAddr.y];
			sortData[2].m_key = ldsSortData[localAddr.z];
			sortData[3].m_key = ldsSortData[localAddr.w];
			GROUP_LDS_BARRIER;
			ldsSortData[dstAddr.x] = sortData[0].m_value;
			ldsSortData[dstAddr.y] = sortData[1].m_value;
			ldsSortData[dstAddr.z] = sortData[2].m_value;
			ldsSortData[dstAddr.w] = sortData[3].m_value;
			GROUP_LDS_BARRIER;
			sortData[0].m_value = ldsSortData[localAddr.x];
			sortData[1].m_value = ldsSortData[localAddr.y];
			sortData[2].m_value = ldsSortData[localAddr.z];
			sortData[3].m_value = ldsSortData[localAddr.w];
			GROUP_LDS_BARRIER;
		}
		bitIdx ++;
	}
	while( bitIdx <(cb.m_startBit+BITS_PER_PASS) );
	{ // generate histogram
		// Histogram the digit of this pass (keys shifted so the digit sits
		// in the low 4 bits; generateHistogram masks them).
		uint4 localKeys = make_uint4( sortData[0].m_key>>cb.m_startBit, sortData[1].m_key>>cb.m_startBit,
			sortData[2].m_key>>cb.m_startBit, sortData[3].m_key>>cb.m_startBit );
		generateHistogram( lIdx, wgIdx, localKeys, ldsSortData );
		GROUP_LDS_BARRIER;
		int nBins = (1<<BITS_PER_PASS);
		// First 16 work items publish the bins in both global layouts.
		if( lIdx < nBins )
		{
			u32 histValues = ldsSortData[lIdx];
			u32 globalAddresses = nBins*wgIdx + lIdx;
			u32 globalAddressesRadixMajor = cb.m_numGroups*lIdx + wgIdx;
			ldsHistogramOut0[globalAddressesRadixMajor] = histValues;
			ldsHistogramOut1[globalAddresses] = histValues;
		}
	}
	{ // write
		// Store the locally sorted slice back in place (slots consecutive,
		// so dstAddr.x+1..3 == dstAddr.y..w).
		u32 offset = nElemsPerWG*wgIdx;
		uint4 dstAddr = make_uint4(offset+localAddr.x, offset+localAddr.y, offset+localAddr.z, offset+localAddr.w );
		sortDataIn[ dstAddr.x + 0 ] = sortData[0];
		sortDataIn[ dstAddr.x + 1 ] = sortData[1];
		sortDataIn[ dstAddr.x + 2 ] = sortData[2];
		sortDataIn[ dstAddr.x + 3 ] = sortData[3];
	}
}
// Final scatter of one radix pass. Input groups are already digit-sorted by
// LocalSortKernel, so each element's destination is simply
//   globalBase[digit] + (localAddr - localFirst[digit])
// where localFirst is the exclusive scan of this group's 16-bin histogram
// (group-major input) and globalBase comes from the radix-major histogram,
// which is presumably already prefix-summed into global scatter offsets by an
// upstream kernel — TODO(review): confirm against the host-side dispatch.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void ScatterKernel(__global SortData *src,
	__global u32 *histogramGlobalRadixMajor,
	__global u32 *histogramLocalGroupMajor,
	__global SortData *dst,
	ConstBuffer cb)
{
	// LDS layout (3*16 entries):
	//   [ 0,16)  global scatter bases per digit     (ldsGlobalHistogram)
	//   [16,32)  local exclusive offsets per digit  (ldsLocalHistogram)
	//   [32,48)  scan window for the local inclusive scan
	__local u32 sorterLocalMemory[3*(1<<BITS_PER_PASS)];
	__local u32 *ldsLocalHistogram = sorterLocalMemory + (1<<BITS_PER_PASS);
	__local u32 *ldsGlobalHistogram = sorterLocalMemory;
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 ldsOffset = (1<<BITS_PER_PASS);
	// load and prefix scan local histogram
	// 8 work items handle the 16 bins: myIdx covers lIdx and lIdx+8 (the
	// literal 8 hard-codes BITS_PER_PASS==4).
	if( lIdx < ((1<<BITS_PER_PASS)/2) )
	{
		uint2 myIdx = make_uint2(lIdx, lIdx+8);
		// Load the bins into the scan window and zero the slots just below
		// it so the strided reads (idx-1..idx-8) off the bottom see 0.
		ldsLocalHistogram[ldsOffset+myIdx.x] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.x];
		ldsLocalHistogram[ldsOffset+myIdx.y] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.y];
		ldsLocalHistogram[ldsOffset+myIdx.x-(1<<BITS_PER_PASS)] = 0;
		ldsLocalHistogram[ldsOffset+myIdx.y-(1<<BITS_PER_PASS)] = 0;
		// Inclusive scan over the 16 bins, odd-slot-owner style; only
		// mem_fences between steps, so NOTE(review): assumes these 8 work
		// items run in lock-step (single wavefront).
		int idx = ldsOffset+2*lIdx;
		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-1];
		GROUP_MEM_FENCE;
		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-2];
		GROUP_MEM_FENCE;
		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-4];
		GROUP_MEM_FENCE;
		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-8];
		GROUP_MEM_FENCE;
		// Propagate intermediate values through
		ldsLocalHistogram[idx-1] += ldsLocalHistogram[idx-2];
		GROUP_MEM_FENCE;
		// Grab and propagate for whole WG - loading the - 1 value
		// (shift by one bin: inclusive scan -> exclusive offsets).
		uint2 localValues;
		localValues.x = ldsLocalHistogram[ldsOffset+myIdx.x-1];
		localValues.y = ldsLocalHistogram[ldsOffset+myIdx.y-1];
		ldsLocalHistogram[myIdx.x] = localValues.x;
		ldsLocalHistogram[myIdx.y] = localValues.y;
		// Cache this group's global scatter bases, one per digit.
		ldsGlobalHistogram[myIdx.x] = histogramGlobalRadixMajor[cb.m_numGroups*myIdx.x + wgIdx];
		ldsGlobalHistogram[myIdx.y] = histogramGlobalRadixMajor[cb.m_numGroups*myIdx.y + wgIdx];
	}
	GROUP_LDS_BARRIER;
	// Each work item scatters its 4 consecutive elements.
	uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
	SortData sortData[4];
	{
		uint4 globalAddr = wgIdx*WG_SIZE*NUM_PER_WI + localAddr;
		sortData[0] = src[globalAddr.x];
		sortData[1] = src[globalAddr.y];
		sortData[2] = src[globalAddr.z];
		sortData[3] = src[globalAddr.w];
	}
	// Extract each element's digit for this pass.
	uint cmpValue = ((1<<BITS_PER_PASS)-1);
	uint4 radix = make_uint4( (sortData[0].m_key>>cb.m_startBit)&cmpValue, (sortData[1].m_key>>cb.m_startBit)&cmpValue,
		(sortData[2].m_key>>cb.m_startBit)&cmpValue, (sortData[3].m_key>>cb.m_startBit)&cmpValue );;
	// data is already sorted. So simply subtract local prefix sum
	uint4 dstAddr;
	dstAddr.x = ldsGlobalHistogram[radix.x] + (localAddr.x - ldsLocalHistogram[radix.x]);
	dstAddr.y = ldsGlobalHistogram[radix.y] + (localAddr.y - ldsLocalHistogram[radix.y]);
	dstAddr.z = ldsGlobalHistogram[radix.z] + (localAddr.z - ldsLocalHistogram[radix.z]);
	dstAddr.w = ldsGlobalHistogram[radix.w] + (localAddr.w - ldsLocalHistogram[radix.w]);
	dst[dstAddr.x] = sortData[0];
	dst[dstAddr.y] = sortData[1];
	dst[dstAddr.z] = sortData[2];
	dst[dstAddr.w] = sortData[3];
}
// Plain element-wise copy: one SortData per work item.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void CopyKernel(__global SortData *src, __global SortData *dst)
{
	u32 gIdx = GET_GLOBAL_IDX;
	dst[ gIdx ] = src[ gIdx ];
}

View File

@@ -0,0 +1,322 @@
/*
2011 Takahiro Harada
*/
typedef uint u32;
#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
#define GROUP_MEM_FENCE
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
#define AtomInc(x) InterlockedAdd(x, 1)
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
#define make_uint4 uint4
#define make_uint2 uint2
// Component-wise select, mirroring OpenCL's select(b, a, cond):
// each lane yields a where the condition lane is non-zero, b otherwise.
uint4 SELECT_UINT4(uint4 b,uint4 a,uint4 condition )
{
	uint4 r;
	r.x = ((condition).x)?a.x:b.x;
	r.y = ((condition).y)?a.y:b.y;
	r.z = ((condition).z)?a.z:b.z;
	r.w = ((condition).w)?a.w:b.w;
	return r;
}
// takahiro end
#define WG_SIZE 128
#define NUM_PER_WI 4
#define GET_GROUP_SIZE WG_SIZE
typedef struct
{
u32 m_key;
u32 m_value;
}SortData;
cbuffer SortCB : register( b0 )
{
u32 m_startBit;
u32 m_numGroups;
u32 m_padding[2];
};
#define BITS_PER_PASS 4
// Inclusive prefix scan of the four lanes of a uint4:
// returns ( x, x+y, x+y+z, x+y+z+w ). Unsigned math, wraps mod 2^32.
uint4 prefixScanVector( uint4 data )
{
	uint4 scan;
	scan.x = data.x;
	scan.y = data.x + data.y;
	scan.z = scan.y + data.z;
	scan.w = scan.z + data.w;
	return scan;
}
// Exclusive prefix scan of the four lanes of 'data', done in place:
// data becomes ( 0, x, x+y, x+y+z ). Returns the lane total x+y+z+w.
// Unsigned arithmetic wraps mod 2^32, so building the exclusive scan
// directly matches the original inclusive-minus-input formulation.
uint prefixScanVectorEx( inout uint4 data )
{
	uint4 v = data;
	data.x = 0;
	data.y = v.x;
	data.z = v.x + v.y;
	data.w = data.z + v.z;
	return data.w + v.w;
}
RWStructuredBuffer<SortData> sortDataIn : register( u0 );
RWStructuredBuffer<u32> ldsHistogramOut0 : register( u1 );
RWStructuredBuffer<u32> ldsHistogramOut1 : register( u2 );
groupshared u32 ldsSortData[ WG_SIZE*NUM_PER_WI + 16 ];
// Group-wide exclusive prefix sum over 512 elements (WG_SIZE threads holding
// a uint4 each), using the shared ldsSortData as scratch. Each thread first
// scans its own vector (prefixScanVectorEx leaves the lane-exclusive scan in
// pData and returns the lane total), then the 128 per-thread totals are
// scanned in LDS and each thread's base offset is added back to its lanes.
// Returns the exclusive prefix sum for this thread's 4 lanes; totalSum
// receives the sum over all 512 elements. The low half [0,128) of the
// scratch is zeroed as padding; totals live in [128,256).
uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum )
{
	{ // Set data
		ldsSortData[lIdx] = 0;
		ldsSortData[lIdx+WG_SIZE] = prefixScanVectorEx( pData );
	}
	GROUP_LDS_BARRIER;
	{ // Prefix sum
		// 64 threads scan the 128 totals: each owns odd slot 'idx'
		// (129..255), accumulating strides 1..64, then fixes up idx-1.
		// NOTE(review): GROUP_MEM_FENCE expands to nothing in this file,
		// so this relies on the 64 active threads running in lock-step
		// (wave size >= 64) — confirm per target GPU.
		int idx = 2*lIdx + (WG_SIZE+1);
		if( lIdx < 64 )
		{
			ldsSortData[idx] += ldsSortData[idx-1];
			GROUP_MEM_FENCE;
			ldsSortData[idx] += ldsSortData[idx-2];
			GROUP_MEM_FENCE;
			ldsSortData[idx] += ldsSortData[idx-4];
			GROUP_MEM_FENCE;
			ldsSortData[idx] += ldsSortData[idx-8];
			GROUP_MEM_FENCE;
			ldsSortData[idx] += ldsSortData[idx-16];
			GROUP_MEM_FENCE;
			ldsSortData[idx] += ldsSortData[idx-32];
			GROUP_MEM_FENCE;
			ldsSortData[idx] += ldsSortData[idx-64];
			GROUP_MEM_FENCE;
			// Propagate the pair's inclusive value into the even slot.
			ldsSortData[idx-1] += ldsSortData[idx-2];
			GROUP_MEM_FENCE;
		}
	}
	GROUP_LDS_BARRIER;
	// Last slot holds the grand total over all 512 elements.
	totalSum = ldsSortData[WG_SIZE*2-1];
	// lIdx+127 == (lIdx-1)+WG_SIZE: the previous thread's inclusive total,
	// i.e. this thread's exclusive base (0 for lIdx==0 via the zeroed half).
	uint addValue = ldsSortData[lIdx+127];
	return pData + make_uint4(addValue, addValue, addValue, addValue);
}
// Accumulate a (1<<BITS_PER_PASS) = 16-bin histogram of the low 4 bits of
// the four (already digit-shifted) keys each thread holds, into the first 16
// slots of the shared ldsSortData. The first 16 threads zero the bins, the
// barrier makes the zeros visible, then every thread atomically bumps the
// bin of each of its 4 keys via InterlockedAdd. The caller must barrier
// again before reading the result.
// NOTE(review): wgIdx is never read here — kept only for signature symmetry.
void generateHistogram(u32 lIdx, u32 wgIdx,
	uint4 sortedData)
{
	if( lIdx < (1<<BITS_PER_PASS) )
	{
		ldsSortData[lIdx] = 0;
	}
	int mask = ((1<<BITS_PER_PASS)-1);
	uint4 keys = make_uint4( (sortedData.x)&mask, (sortedData.y)&mask, (sortedData.z)&mask, (sortedData.w)&mask );
	GROUP_LDS_BARRIER;
	AtomInc( ldsSortData[keys.x] );
	AtomInc( ldsSortData[keys.y] );
	AtomInc( ldsSortData[keys.z] );
	AtomInc( ldsSortData[keys.w] );
}
// One 4-bit radix-sort pass, local to each thread group (HLSL twin of the
// OpenCL LocalSortKernel). Every group loads its own WG_SIZE*NUM_PER_WI
// (= 512) SortData elements from sortDataIn, sorts them in place on the
// digit in bits [m_startBit, m_startBit+BITS_PER_PASS) of m_key via four
// stable 1-bit split steps, writes them back, and emits the group's 16-bin
// digit histogram in two layouts:
//   ldsHistogramOut0 — radix-major: bin*m_numGroups + group
//   ldsHistogramOut1 — group-major: group*16 + bin
[numthreads(WG_SIZE, 1, 1)]
void LocalSortKernel( DEFAULT_ARGS )
{
	int nElemsPerWG = WG_SIZE*NUM_PER_WI;
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;	// NOTE(review): unused below
	// The 4 consecutive local slots owned by this thread.
	uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
	SortData sortData[NUM_PER_WI];
	{ // load this group's slice, 4 elements per thread
		u32 offset = nElemsPerWG*wgIdx;
		sortData[0] = sortDataIn[offset+localAddr.x];
		sortData[1] = sortDataIn[offset+localAddr.y];
		sortData[2] = sortDataIn[offset+localAddr.z];
		sortData[3] = sortDataIn[offset+localAddr.w];
	}
	int bitIdx = m_startBit;
	do
	{
		// Stable split on bit 'bitIdx': 0-bit elements first, order kept.
// what is this?
// if( lIdx == wgSize-1 ) ldsSortData[256] = sortData[3].m_key;
		u32 mask = (1<<bitIdx);
		uint4 cmpResult = make_uint4( sortData[0].m_key & mask, sortData[1].m_key & mask, sortData[2].m_key & mask, sortData[3].m_key & mask );
		// 1 where the bit is 0; after the scan this is each element's rank
		// among the 0-bit elements, and 'total' is their count.
		uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );
		u32 total;
		prefixSum = localPrefixSum128V( prefixSum, lIdx, total );
		{
			// 0-bit elements go to their rank; 1-bit elements go after all
			// 'total' 0-bit ones (localAddr - rank = rank among 1-bits).
			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );
			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );
			// Shuffle keys, then values, through the same LDS scratch —
			// hence a barrier between every write phase and read phase.
			GROUP_LDS_BARRIER;
			ldsSortData[dstAddr.x] = sortData[0].m_key;
			ldsSortData[dstAddr.y] = sortData[1].m_key;
			ldsSortData[dstAddr.z] = sortData[2].m_key;
			ldsSortData[dstAddr.w] = sortData[3].m_key;
			GROUP_LDS_BARRIER;
			sortData[0].m_key = ldsSortData[localAddr.x];
			sortData[1].m_key = ldsSortData[localAddr.y];
			sortData[2].m_key = ldsSortData[localAddr.z];
			sortData[3].m_key = ldsSortData[localAddr.w];
			GROUP_LDS_BARRIER;
			ldsSortData[dstAddr.x] = sortData[0].m_value;
			ldsSortData[dstAddr.y] = sortData[1].m_value;
			ldsSortData[dstAddr.z] = sortData[2].m_value;
			ldsSortData[dstAddr.w] = sortData[3].m_value;
			GROUP_LDS_BARRIER;
			sortData[0].m_value = ldsSortData[localAddr.x];
			sortData[1].m_value = ldsSortData[localAddr.y];
			sortData[2].m_value = ldsSortData[localAddr.z];
			sortData[3].m_value = ldsSortData[localAddr.w];
			GROUP_LDS_BARRIER;
		}
		bitIdx ++;
	}
	while( bitIdx <(m_startBit+BITS_PER_PASS) );
	{ // generate histogram
		// Histogram the digit of this pass (keys shifted so the digit sits
		// in the low 4 bits; generateHistogram masks them).
		uint4 localKeys = make_uint4( sortData[0].m_key>>m_startBit, sortData[1].m_key>>m_startBit,
			sortData[2].m_key>>m_startBit, sortData[3].m_key>>m_startBit );
		generateHistogram( lIdx, wgIdx, localKeys );
		GROUP_LDS_BARRIER;
		int nBins = (1<<BITS_PER_PASS);
		// First 16 threads publish the bins in both global layouts.
		if( lIdx < nBins )
		{
			u32 histValues = ldsSortData[lIdx];
			u32 globalAddresses = nBins*wgIdx + lIdx;
			u32 globalAddressesRadixMajor = m_numGroups*lIdx + wgIdx;
			ldsHistogramOut0[globalAddressesRadixMajor] = histValues;
			ldsHistogramOut1[globalAddresses] = histValues;
		}
	}
	{ // write
		// Store the locally sorted slice back in place (slots consecutive,
		// so dstAddr.x+1..3 == dstAddr.y..w).
		u32 offset = nElemsPerWG*wgIdx;
		uint4 dstAddr = make_uint4(offset+localAddr.x, offset+localAddr.y, offset+localAddr.z, offset+localAddr.w );
		sortDataIn[ dstAddr.x + 0 ] = sortData[0];
		sortDataIn[ dstAddr.x + 1 ] = sortData[1];
		sortDataIn[ dstAddr.x + 2 ] = sortData[2];
		sortDataIn[ dstAddr.x + 3 ] = sortData[3];
	}
}
StructuredBuffer<SortData> src : register( t0 );
StructuredBuffer<u32> histogramGlobalRadixMajor : register( t1 );
StructuredBuffer<u32> histogramLocalGroupMajor : register( t2 );
RWStructuredBuffer<SortData> dst : register( u0 );
groupshared u32 ldsLocalHistogram[ 2*(1<<BITS_PER_PASS) ];
groupshared u32 ldsGlobalHistogram[ (1<<BITS_PER_PASS) ];
// Final scatter of one radix pass (HLSL twin of the OpenCL ScatterKernel).
// Input groups are already digit-sorted by LocalSortKernel, so each
// element's destination is
//   globalBase[digit] + (localAddr - localFirst[digit])
// where localFirst is the exclusive scan of this group's 16-bin histogram
// (group-major input) and globalBase comes from the radix-major histogram,
// which is presumably already prefix-summed into global scatter offsets by
// an upstream kernel — TODO(review): confirm against the host-side dispatch.
// ldsLocalHistogram layout: [16,32) is the inclusive-scan window, [0,16)
// starts as zero padding for the strided reads and ends up holding the
// per-digit exclusive offsets.
[numthreads(WG_SIZE, 1, 1)]
void ScatterKernel( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 ldsOffset = (1<<BITS_PER_PASS);
	// load and prefix scan local histogram
	// 8 threads handle the 16 bins: myIdx covers lIdx and lIdx+8 (the
	// literal 8 hard-codes BITS_PER_PASS==4).
	if( lIdx < ((1<<BITS_PER_PASS)/2) )
	{
		uint2 myIdx = make_uint2(lIdx, lIdx+8);
		// Load the bins into the scan window and zero the slots just below
		// it so the strided reads (idx-1..idx-8) off the bottom see 0.
		ldsLocalHistogram[ldsOffset+myIdx.x] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.x];
		ldsLocalHistogram[ldsOffset+myIdx.y] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.y];
		ldsLocalHistogram[ldsOffset+myIdx.x-(1<<BITS_PER_PASS)] = 0;
		ldsLocalHistogram[ldsOffset+myIdx.y-(1<<BITS_PER_PASS)] = 0;
		// Inclusive scan over the 16 bins, odd-slot-owner style.
		// NOTE(review): GROUP_MEM_FENCE expands to nothing in this file, so
		// this relies on the 8 active threads running in lock-step.
		int idx = ldsOffset+2*lIdx;
		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-1];
		GROUP_MEM_FENCE;
		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-2];
		GROUP_MEM_FENCE;
		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-4];
		GROUP_MEM_FENCE;
		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-8];
		GROUP_MEM_FENCE;
		// Propagate intermediate values through
		ldsLocalHistogram[idx-1] += ldsLocalHistogram[idx-2];
		GROUP_MEM_FENCE;
		// Grab and propagate for whole WG - loading the - 1 value
		// (shift by one bin: inclusive scan -> exclusive offsets).
		uint2 localValues;
		localValues.x = ldsLocalHistogram[ldsOffset+myIdx.x-1];
		localValues.y = ldsLocalHistogram[ldsOffset+myIdx.y-1];
		ldsLocalHistogram[myIdx.x] = localValues.x;
		ldsLocalHistogram[myIdx.y] = localValues.y;
		// Cache this group's global scatter bases, one per digit.
		ldsGlobalHistogram[myIdx.x] = histogramGlobalRadixMajor[m_numGroups*myIdx.x + wgIdx];
		ldsGlobalHistogram[myIdx.y] = histogramGlobalRadixMajor[m_numGroups*myIdx.y + wgIdx];
	}
	GROUP_LDS_BARRIER;
	// Each thread scatters its 4 consecutive elements.
	uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
	SortData sortData[4];
	{
		uint4 globalAddr = wgIdx*WG_SIZE*NUM_PER_WI + localAddr;
		sortData[0] = src[globalAddr.x];
		sortData[1] = src[globalAddr.y];
		sortData[2] = src[globalAddr.z];
		sortData[3] = src[globalAddr.w];
	}
	// Extract each element's digit for this pass.
	uint cmpValue = ((1<<BITS_PER_PASS)-1);
	uint4 radix = make_uint4( (sortData[0].m_key>>m_startBit)&cmpValue, (sortData[1].m_key>>m_startBit)&cmpValue,
		(sortData[2].m_key>>m_startBit)&cmpValue, (sortData[3].m_key>>m_startBit)&cmpValue );;
	// data is already sorted. So simply subtract local prefix sum
	uint4 dstAddr;
	dstAddr.x = ldsGlobalHistogram[radix.x] + (localAddr.x - ldsLocalHistogram[radix.x]);
	dstAddr.y = ldsGlobalHistogram[radix.y] + (localAddr.y - ldsLocalHistogram[radix.y]);
	dstAddr.z = ldsGlobalHistogram[radix.z] + (localAddr.z - ldsLocalHistogram[radix.z]);
	dstAddr.w = ldsGlobalHistogram[radix.w] + (localAddr.w - ldsLocalHistogram[radix.w]);
	dst[dstAddr.x] = sortData[0];
	dst[dstAddr.y] = sortData[1];
	dst[dstAddr.z] = sortData[2];
	dst[dstAddr.w] = sortData[3];
}
// Plain element-wise copy: one SortData per thread, src -> dst.
[numthreads(WG_SIZE, 1, 1)]
void CopyKernel( DEFAULT_ARGS )
{
	u32 gIdx = GET_GLOBAL_IDX;
	dst[ gIdx ] = src[ gIdx ];
}

View File

@@ -0,0 +1,347 @@
static const char* radixSortStandardKernelsCL= \
"/*\n"
"Bullet Continuous Collision Detection and Physics Library\n"
"Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Author Takahiro Harada\n"
"\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
"#define WG_SIZE 128\n"
"#define NUM_PER_WI 4\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_key; \n"
" u32 m_value;\n"
"}SortData;\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_startBit;\n"
" u32 m_numGroups;\n"
" u32 m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"#define BITS_PER_PASS 4\n"
"\n"
"\n"
"\n"
"uint4 prefixScanVector( uint4 data )\n"
"{\n"
" data.y += data.x;\n"
" data.w += data.z;\n"
" data.z += data.y;\n"
" data.w += data.y;\n"
" return data;\n"
"}\n"
"\n"
"uint prefixScanVectorEx( uint4* data )\n"
"{\n"
" uint4 backup = data[0];\n"
" data[0].y += data[0].x;\n"
" data[0].w += data[0].z;\n"
" data[0].z += data[0].y;\n"
" data[0].w += data[0].y;\n"
" uint sum = data[0].w;\n"
" *data -= backup;\n"
" return sum;\n"
"}\n"
"\n"
"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32 sorterSharedMemory[] )\n"
"{\n"
" { // Set data\n"
" sorterSharedMemory[lIdx] = 0;\n"
" sorterSharedMemory[lIdx+WG_SIZE] = prefixScanVectorEx( &pData );\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // Prefix sum\n"
" int idx = 2*lIdx + (WG_SIZE+1);\n"
" if( lIdx < 64 )\n"
" {\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
" GROUP_MEM_FENCE;\n"
"\n"
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" *totalSum = sorterSharedMemory[WG_SIZE*2-1];\n"
" uint addValue = sorterSharedMemory[lIdx+127];\n"
" return pData + make_uint4(addValue, addValue, addValue, addValue);\n"
"}\n"
"\n"
"\n"
"void generateHistogram(u32 lIdx, u32 wgIdx, \n"
" uint4 sortedData,\n"
" __local u32 *histogram)\n"
"{\n"
" if( lIdx < (1<<BITS_PER_PASS) )\n"
" {\n"
" histogram[lIdx] = 0;\n"
" }\n"
"\n"
" int mask = ((1<<BITS_PER_PASS)-1);\n"
" uint4 keys = make_uint4( (sortedData.x)&mask, (sortedData.y)&mask, (sortedData.z)&mask, (sortedData.w)&mask );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" AtomInc( histogram[keys.x] );\n"
" AtomInc( histogram[keys.y] );\n"
" AtomInc( histogram[keys.z] );\n"
" AtomInc( histogram[keys.w] );\n"
"}\n"
"\n"
"//\n"
"//\n"
"//\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void LocalSortKernel(__global SortData* sortDataIn, \n"
" __global u32* ldsHistogramOut0,\n"
" __global u32* ldsHistogramOut1,\n"
" ConstBuffer cb)\n"
"{\n"
"\n"
" __local u32 ldsSortData[ WG_SIZE*NUM_PER_WI + 16 ];\n"
"\n"
" int nElemsPerWG = WG_SIZE*NUM_PER_WI;\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
"\n"
" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
"\n"
"\n"
" SortData sortData[NUM_PER_WI];\n"
"\n"
" {\n"
" u32 offset = nElemsPerWG*wgIdx;\n"
" sortData[0] = sortDataIn[offset+localAddr.x];\n"
" sortData[1] = sortDataIn[offset+localAddr.y];\n"
" sortData[2] = sortDataIn[offset+localAddr.z];\n"
" sortData[3] = sortDataIn[offset+localAddr.w];\n"
" }\n"
"\n"
" int bitIdx = cb.m_startBit;\n"
" do\n"
" {\n"
"// what is this?\n"
"// if( lIdx == wgSize-1 ) ldsSortData[256] = sortData[3].m_key;\n"
" u32 mask = (1<<bitIdx);\n"
" uint4 cmpResult = make_uint4( sortData[0].m_key & mask, sortData[1].m_key & mask, sortData[2].m_key & mask, sortData[3].m_key & mask );\n"
" uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n"
" u32 total;\n"
" prefixSum = localPrefixSum128V( prefixSum, lIdx, &total, ldsSortData );\n"
"\n"
" {\n"
" uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n"
" dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" ldsSortData[dstAddr.x] = sortData[0].m_key;\n"
" ldsSortData[dstAddr.y] = sortData[1].m_key;\n"
" ldsSortData[dstAddr.z] = sortData[2].m_key;\n"
" ldsSortData[dstAddr.w] = sortData[3].m_key;\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" sortData[0].m_key = ldsSortData[localAddr.x];\n"
" sortData[1].m_key = ldsSortData[localAddr.y];\n"
" sortData[2].m_key = ldsSortData[localAddr.z];\n"
" sortData[3].m_key = ldsSortData[localAddr.w];\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" ldsSortData[dstAddr.x] = sortData[0].m_value;\n"
" ldsSortData[dstAddr.y] = sortData[1].m_value;\n"
" ldsSortData[dstAddr.z] = sortData[2].m_value;\n"
" ldsSortData[dstAddr.w] = sortData[3].m_value;\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" sortData[0].m_value = ldsSortData[localAddr.x];\n"
" sortData[1].m_value = ldsSortData[localAddr.y];\n"
" sortData[2].m_value = ldsSortData[localAddr.z];\n"
" sortData[3].m_value = ldsSortData[localAddr.w];\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" bitIdx ++;\n"
" }\n"
" while( bitIdx <(cb.m_startBit+BITS_PER_PASS) );\n"
"\n"
" { // generate historgram\n"
" uint4 localKeys = make_uint4( sortData[0].m_key>>cb.m_startBit, sortData[1].m_key>>cb.m_startBit, \n"
" sortData[2].m_key>>cb.m_startBit, sortData[3].m_key>>cb.m_startBit );\n"
"\n"
" generateHistogram( lIdx, wgIdx, localKeys, ldsSortData );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" int nBins = (1<<BITS_PER_PASS);\n"
" if( lIdx < nBins )\n"
" {\n"
" u32 histValues = ldsSortData[lIdx];\n"
"\n"
" u32 globalAddresses = nBins*wgIdx + lIdx;\n"
" u32 globalAddressesRadixMajor = cb.m_numGroups*lIdx + wgIdx;\n"
" \n"
" ldsHistogramOut0[globalAddressesRadixMajor] = histValues;\n"
" ldsHistogramOut1[globalAddresses] = histValues;\n"
" }\n"
" }\n"
"\n"
"\n"
" { // write\n"
" u32 offset = nElemsPerWG*wgIdx;\n"
" uint4 dstAddr = make_uint4(offset+localAddr.x, offset+localAddr.y, offset+localAddr.z, offset+localAddr.w );\n"
"\n"
" sortDataIn[ dstAddr.x + 0 ] = sortData[0];\n"
" sortDataIn[ dstAddr.x + 1 ] = sortData[1];\n"
" sortDataIn[ dstAddr.x + 2 ] = sortData[2];\n"
" sortDataIn[ dstAddr.x + 3 ] = sortData[3];\n"
" }\n"
"}\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void ScatterKernel(__global SortData *src,\n"
" __global u32 *histogramGlobalRadixMajor,\n"
" __global u32 *histogramLocalGroupMajor,\n"
" __global SortData *dst,\n"
" ConstBuffer cb)\n"
"{\n"
" __local u32 sorterLocalMemory[3*(1<<BITS_PER_PASS)];\n"
" __local u32 *ldsLocalHistogram = sorterLocalMemory + (1<<BITS_PER_PASS);\n"
" __local u32 *ldsGlobalHistogram = sorterLocalMemory;\n"
"\n"
"\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 ldsOffset = (1<<BITS_PER_PASS);\n"
"\n"
" // load and prefix scan local histogram\n"
" if( lIdx < ((1<<BITS_PER_PASS)/2) )\n"
" {\n"
" uint2 myIdx = make_uint2(lIdx, lIdx+8);\n"
"\n"
" ldsLocalHistogram[ldsOffset+myIdx.x] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.x];\n"
" ldsLocalHistogram[ldsOffset+myIdx.y] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.y];\n"
" ldsLocalHistogram[ldsOffset+myIdx.x-(1<<BITS_PER_PASS)] = 0;\n"
" ldsLocalHistogram[ldsOffset+myIdx.y-(1<<BITS_PER_PASS)] = 0;\n"
"\n"
" int idx = ldsOffset+2*lIdx;\n"
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-1];\n"
" GROUP_MEM_FENCE;\n"
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-2];\n"
" GROUP_MEM_FENCE;\n"
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-4];\n"
" GROUP_MEM_FENCE;\n"
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-8];\n"
" GROUP_MEM_FENCE;\n"
"\n"
" // Propagate intermediate values through\n"
" ldsLocalHistogram[idx-1] += ldsLocalHistogram[idx-2];\n"
" GROUP_MEM_FENCE;\n"
"\n"
" // Grab and propagate for whole WG - loading the - 1 value\n"
" uint2 localValues;\n"
" localValues.x = ldsLocalHistogram[ldsOffset+myIdx.x-1];\n"
" localValues.y = ldsLocalHistogram[ldsOffset+myIdx.y-1];\n"
"\n"
" ldsLocalHistogram[myIdx.x] = localValues.x;\n"
" ldsLocalHistogram[myIdx.y] = localValues.y;\n"
"\n"
"\n"
" ldsGlobalHistogram[myIdx.x] = histogramGlobalRadixMajor[cb.m_numGroups*myIdx.x + wgIdx];\n"
" ldsGlobalHistogram[myIdx.y] = histogramGlobalRadixMajor[cb.m_numGroups*myIdx.y + wgIdx];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
"\n"
" SortData sortData[4];\n"
" {\n"
" uint4 globalAddr = wgIdx*WG_SIZE*NUM_PER_WI + localAddr;\n"
" sortData[0] = src[globalAddr.x];\n"
" sortData[1] = src[globalAddr.y];\n"
" sortData[2] = src[globalAddr.z];\n"
" sortData[3] = src[globalAddr.w];\n"
" }\n"
"\n"
" uint cmpValue = ((1<<BITS_PER_PASS)-1);\n"
" uint4 radix = make_uint4( (sortData[0].m_key>>cb.m_startBit)&cmpValue, (sortData[1].m_key>>cb.m_startBit)&cmpValue, \n"
" (sortData[2].m_key>>cb.m_startBit)&cmpValue, (sortData[3].m_key>>cb.m_startBit)&cmpValue );;\n"
"\n"
" // data is already sorted. So simply subtract local prefix sum\n"
" uint4 dstAddr;\n"
" dstAddr.x = ldsGlobalHistogram[radix.x] + (localAddr.x - ldsLocalHistogram[radix.x]);\n"
" dstAddr.y = ldsGlobalHistogram[radix.y] + (localAddr.y - ldsLocalHistogram[radix.y]);\n"
" dstAddr.z = ldsGlobalHistogram[radix.z] + (localAddr.z - ldsLocalHistogram[radix.z]);\n"
" dstAddr.w = ldsGlobalHistogram[radix.w] + (localAddr.w - ldsLocalHistogram[radix.w]);\n"
"\n"
" dst[dstAddr.x] = sortData[0];\n"
" dst[dstAddr.y] = sortData[1];\n"
" dst[dstAddr.z] = sortData[2];\n"
" dst[dstAddr.w] = sortData[3];\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void CopyKernel(__global SortData *src, __global SortData *dst)\n"
"{\n"
" dst[ GET_GLOBAL_IDX ] = src[ GET_GLOBAL_IDX ];\n"
"}\n"
;

View File

@@ -0,0 +1,324 @@
// Direct3D 11 (HLSL) kernel source for the "standard" radix sort, embedded as a
// single C string so no external kernel file is needed at runtime.
// This header is GENERATED from Sort/RadixSortStandardKernels.hlsl by
// stringify.py (see stringify.bat): do not hand-edit the string contents below;
// edit the .hlsl source and regenerate instead.
static const char* radixSortStandardKernelsDX11= \
"/*\n"
"		2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define GROUP_MEM_FENCE\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"#define make_uint4 uint4\n"
"#define make_uint2 uint2\n"
"\n"
"uint4 SELECT_UINT4(uint4 b,uint4 a,uint4 condition ){ return make_uint4( ((condition).x)?a.x:b.x, ((condition).y)?a.y:b.y, ((condition).z)?a.z:b.z, ((condition).w)?a.w:b.w ); }\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128\n"
"#define NUM_PER_WI 4\n"
"\n"
"#define GET_GROUP_SIZE WG_SIZE\n"
"\n"
"typedef struct\n"
"{\n"
"	u32 m_key; \n"
"	u32 m_value;\n"
"}SortData;\n"
"\n"
"cbuffer SortCB : register( b0 )\n"
"{\n"
"	u32 m_startBit;\n"
"	u32 m_numGroups;\n"
"	u32 m_padding[2];\n"
"};\n"
"\n"
"#define BITS_PER_PASS 4\n"
"\n"
"\n"
"uint4 prefixScanVector( uint4 data )\n"
"{\n"
"	data.y += data.x;\n"
"	data.w += data.z;\n"
"	data.z += data.y;\n"
"	data.w += data.y;\n"
"	return data;\n"
"}\n"
"\n"
"uint prefixScanVectorEx( inout uint4 data )\n"
"{\n"
"	uint4 backup = data;\n"
"	data.y += data.x;\n"
"	data.w += data.z;\n"
"	data.z += data.y;\n"
"	data.w += data.y;\n"
"	uint sum = data.w;\n"
"	data -= backup;\n"
"	return sum;\n"
"}\n"
"\n"
"\n"
"\n"
"RWStructuredBuffer<SortData> sortDataIn : register( u0 );\n"
"RWStructuredBuffer<u32> ldsHistogramOut0 : register( u1 );\n"
"RWStructuredBuffer<u32> ldsHistogramOut1 : register( u2 );\n"
"\n"
"groupshared u32 ldsSortData[ WG_SIZE*NUM_PER_WI + 16 ];\n"
"\n"
"\n"
"uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum )\n"
"{\n"
"	{ // Set data\n"
"		ldsSortData[lIdx] = 0;\n"
"		ldsSortData[lIdx+WG_SIZE] = prefixScanVectorEx( pData );\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	{ // Prefix sum\n"
"		int idx = 2*lIdx + (WG_SIZE+1);\n"
"		if( lIdx < 64 )\n"
"		{\n"
"			ldsSortData[idx] += ldsSortData[idx-1];\n"
"			GROUP_MEM_FENCE;\n"
"			ldsSortData[idx] += ldsSortData[idx-2]; \n"
"			GROUP_MEM_FENCE;\n"
"			ldsSortData[idx] += ldsSortData[idx-4];\n"
"			GROUP_MEM_FENCE;\n"
"			ldsSortData[idx] += ldsSortData[idx-8];\n"
"			GROUP_MEM_FENCE;\n"
"			ldsSortData[idx] += ldsSortData[idx-16];\n"
"			GROUP_MEM_FENCE;\n"
"			ldsSortData[idx] += ldsSortData[idx-32]; \n"
"			GROUP_MEM_FENCE;\n"
"			ldsSortData[idx] += ldsSortData[idx-64];\n"
"			GROUP_MEM_FENCE;\n"
"\n"
"			ldsSortData[idx-1] += ldsSortData[idx-2];\n"
"			GROUP_MEM_FENCE;\n"
"		}\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	totalSum = ldsSortData[WG_SIZE*2-1];\n"
"	uint addValue = ldsSortData[lIdx+127];\n"
"	return pData + make_uint4(addValue, addValue, addValue, addValue);\n"
"}\n"
"\n"
"void generateHistogram(u32 lIdx, u32 wgIdx, \n"
"						uint4 sortedData)\n"
"{\n"
"	if( lIdx < (1<<BITS_PER_PASS) )\n"
"	{\n"
"		ldsSortData[lIdx] = 0;\n"
"	}\n"
"\n"
"	int mask = ((1<<BITS_PER_PASS)-1);\n"
"	uint4 keys = make_uint4( (sortedData.x)&mask, (sortedData.y)&mask, (sortedData.z)&mask, (sortedData.w)&mask );\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"	\n"
"	AtomInc( ldsSortData[keys.x] );\n"
"	AtomInc( ldsSortData[keys.y] );\n"
"	AtomInc( ldsSortData[keys.z] );\n"
"	AtomInc( ldsSortData[keys.w] );\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void LocalSortKernel( DEFAULT_ARGS )\n"
"{\n"
"	int nElemsPerWG = WG_SIZE*NUM_PER_WI;\n"
"	u32 lIdx = GET_LOCAL_IDX;\n"
"	u32 wgIdx = GET_GROUP_IDX;\n"
"	u32 wgSize = GET_GROUP_SIZE;\n"
"\n"
"	uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
"\n"
"\n"
"	SortData sortData[NUM_PER_WI];\n"
"\n"
"	{\n"
"		u32 offset = nElemsPerWG*wgIdx;\n"
"		sortData[0] = sortDataIn[offset+localAddr.x];\n"
"		sortData[1] = sortDataIn[offset+localAddr.y];\n"
"		sortData[2] = sortDataIn[offset+localAddr.z];\n"
"		sortData[3] = sortDataIn[offset+localAddr.w];\n"
"	}\n"
"\n"
"	int bitIdx = m_startBit;\n"
"	do\n"
"	{\n"
"// what is this?\n"
"//		if( lIdx == wgSize-1 ) ldsSortData[256] = sortData[3].m_key;\n"
"		u32 mask = (1<<bitIdx);\n"
"		uint4 cmpResult = make_uint4( sortData[0].m_key & mask, sortData[1].m_key & mask, sortData[2].m_key & mask, sortData[3].m_key & mask );\n"
"		uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n"
"		u32 total;\n"
"		prefixSum = localPrefixSum128V( prefixSum, lIdx, total );\n"
"\n"
"		{\n"
"			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n"
"			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n"
"\n"
"			GROUP_LDS_BARRIER;\n"
"\n"
"			ldsSortData[dstAddr.x] = sortData[0].m_key;\n"
"			ldsSortData[dstAddr.y] = sortData[1].m_key;\n"
"			ldsSortData[dstAddr.z] = sortData[2].m_key;\n"
"			ldsSortData[dstAddr.w] = sortData[3].m_key;\n"
"\n"
"			GROUP_LDS_BARRIER;\n"
"\n"
"			sortData[0].m_key = ldsSortData[localAddr.x];\n"
"			sortData[1].m_key = ldsSortData[localAddr.y];\n"
"			sortData[2].m_key = ldsSortData[localAddr.z];\n"
"			sortData[3].m_key = ldsSortData[localAddr.w];\n"
"\n"
"			GROUP_LDS_BARRIER;\n"
"\n"
"			ldsSortData[dstAddr.x] = sortData[0].m_value;\n"
"			ldsSortData[dstAddr.y] = sortData[1].m_value;\n"
"			ldsSortData[dstAddr.z] = sortData[2].m_value;\n"
"			ldsSortData[dstAddr.w] = sortData[3].m_value;\n"
"\n"
"			GROUP_LDS_BARRIER;\n"
"\n"
"			sortData[0].m_value = ldsSortData[localAddr.x];\n"
"			sortData[1].m_value = ldsSortData[localAddr.y];\n"
"			sortData[2].m_value = ldsSortData[localAddr.z];\n"
"			sortData[3].m_value = ldsSortData[localAddr.w];\n"
"\n"
"			GROUP_LDS_BARRIER;\n"
"		}\n"
"		bitIdx ++;\n"
"	}\n"
"	while( bitIdx <(m_startBit+BITS_PER_PASS) );\n"
"\n"
"	{ // generate historgram\n"
"		uint4 localKeys = make_uint4( sortData[0].m_key>>m_startBit, sortData[1].m_key>>m_startBit, \n"
"						sortData[2].m_key>>m_startBit, sortData[3].m_key>>m_startBit );\n"
"\n"
"		generateHistogram( lIdx, wgIdx, localKeys );\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
"		int nBins = (1<<BITS_PER_PASS);\n"
"		if( lIdx < nBins )\n"
"		{\n"
"			u32 histValues = ldsSortData[lIdx];\n"
"\n"
"			u32 globalAddresses = nBins*wgIdx + lIdx;\n"
"			u32 globalAddressesRadixMajor = m_numGroups*lIdx + wgIdx;\n"
"			\n"
"			ldsHistogramOut0[globalAddressesRadixMajor] = histValues;\n"
"			ldsHistogramOut1[globalAddresses] = histValues;\n"
"		}\n"
"	}\n"
"\n"
"	{ // write\n"
"		u32 offset = nElemsPerWG*wgIdx;\n"
"		uint4 dstAddr = make_uint4(offset+localAddr.x, offset+localAddr.y, offset+localAddr.z, offset+localAddr.w );\n"
"\n"
"		sortDataIn[ dstAddr.x + 0 ] = sortData[0];\n"
"		sortDataIn[ dstAddr.x + 1 ] = sortData[1];\n"
"		sortDataIn[ dstAddr.x + 2 ] = sortData[2];\n"
"		sortDataIn[ dstAddr.x + 3 ] = sortData[3];\n"
"	}\n"
"}\n"
"\n"
"StructuredBuffer<SortData> src : register( t0 );\n"
"StructuredBuffer<u32> histogramGlobalRadixMajor : register( t1 );\n"
"StructuredBuffer<u32> histogramLocalGroupMajor : register( t2 );\n"
"\n"
"RWStructuredBuffer<SortData> dst : register( u0 );\n"
"\n"
"groupshared u32 ldsLocalHistogram[ 2*(1<<BITS_PER_PASS) ];\n"
"groupshared u32 ldsGlobalHistogram[ (1<<BITS_PER_PASS) ];\n"
"\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void ScatterKernel( DEFAULT_ARGS )\n"
"{\n"
"	u32 lIdx = GET_LOCAL_IDX;\n"
"	u32 wgIdx = GET_GROUP_IDX;\n"
"	u32 ldsOffset = (1<<BITS_PER_PASS);\n"
"\n"
"	// load and prefix scan local histogram\n"
"	if( lIdx < ((1<<BITS_PER_PASS)/2) )\n"
"	{\n"
"		uint2 myIdx = make_uint2(lIdx, lIdx+8);\n"
"\n"
"		ldsLocalHistogram[ldsOffset+myIdx.x] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.x];\n"
"		ldsLocalHistogram[ldsOffset+myIdx.y] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.y];\n"
"		ldsLocalHistogram[ldsOffset+myIdx.x-(1<<BITS_PER_PASS)] = 0;\n"
"		ldsLocalHistogram[ldsOffset+myIdx.y-(1<<BITS_PER_PASS)] = 0;\n"
"\n"
"		int idx = ldsOffset+2*lIdx;\n"
"		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-1];\n"
"		GROUP_MEM_FENCE;\n"
"		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-2];\n"
"		GROUP_MEM_FENCE;\n"
"		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-4];\n"
"		GROUP_MEM_FENCE;\n"
"		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-8];\n"
"		GROUP_MEM_FENCE;\n"
"\n"
"		// Propagate intermediate values through\n"
"		ldsLocalHistogram[idx-1] += ldsLocalHistogram[idx-2];\n"
"		GROUP_MEM_FENCE;\n"
"\n"
"		// Grab and propagate for whole WG - loading the - 1 value\n"
"		uint2 localValues;\n"
"		localValues.x = ldsLocalHistogram[ldsOffset+myIdx.x-1];\n"
"		localValues.y = ldsLocalHistogram[ldsOffset+myIdx.y-1];\n"
"\n"
"		ldsLocalHistogram[myIdx.x] = localValues.x;\n"
"		ldsLocalHistogram[myIdx.y] = localValues.y;\n"
"\n"
"\n"
"		ldsGlobalHistogram[myIdx.x] = histogramGlobalRadixMajor[m_numGroups*myIdx.x + wgIdx];\n"
"		ldsGlobalHistogram[myIdx.y] = histogramGlobalRadixMajor[m_numGroups*myIdx.y + wgIdx];\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
"\n"
"	SortData sortData[4];\n"
"	{\n"
"		uint4 globalAddr = wgIdx*WG_SIZE*NUM_PER_WI + localAddr;\n"
"		sortData[0] = src[globalAddr.x];\n"
"		sortData[1] = src[globalAddr.y];\n"
"		sortData[2] = src[globalAddr.z];\n"
"		sortData[3] = src[globalAddr.w];\n"
"	}\n"
"\n"
"	uint cmpValue = ((1<<BITS_PER_PASS)-1);\n"
"	uint4 radix = make_uint4( (sortData[0].m_key>>m_startBit)&cmpValue, (sortData[1].m_key>>m_startBit)&cmpValue, \n"
"		(sortData[2].m_key>>m_startBit)&cmpValue, (sortData[3].m_key>>m_startBit)&cmpValue );;\n"
"\n"
"	// data is already sorted. So simply subtract local prefix sum\n"
"	uint4 dstAddr;\n"
"	dstAddr.x = ldsGlobalHistogram[radix.x] + (localAddr.x - ldsLocalHistogram[radix.x]);\n"
"	dstAddr.y = ldsGlobalHistogram[radix.y] + (localAddr.y - ldsLocalHistogram[radix.y]);\n"
"	dstAddr.z = ldsGlobalHistogram[radix.z] + (localAddr.z - ldsLocalHistogram[radix.z]);\n"
"	dstAddr.w = ldsGlobalHistogram[radix.w] + (localAddr.w - ldsLocalHistogram[radix.w]);\n"
"\n"
"	dst[dstAddr.x] = sortData[0];\n"
"	dst[dstAddr.y] = sortData[1];\n"
"	dst[dstAddr.z] = sortData[2];\n"
"	dst[dstAddr.w] = sortData[3];\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void CopyKernel( DEFAULT_ARGS )\n"
"{\n"
"	dst[ GET_GLOBAL_IDX ] = src[ GET_GLOBAL_IDX ];\n"
"}\n"
;

View File

@@ -0,0 +1,31 @@
/*
		2011 Takahiro Harada
*/
#pragma once

#include <AdlPrimitives/Math/Math.h>

namespace adl
{
	// Key/value pair sorted by the radix-sort primitives. Layout (two u32s)
	// must match the SortData struct declared inside the kernel sources.
	struct SortData
	{
		// Default constructor intentionally leaves members uninitialized
		// (avoids clearing large buffers element by element).
		SortData(){}
		SortData( u32 key, u32 value ) : m_key(key), m_value(value) {}

		union
		{
			u32 m_key;	// full 32bit sort key
			// Aliased 16bit halves of the key (anonymous struct is a
			// compiler extension, accepted by MSVC/GCC/Clang).
			struct { u16 m_key16[2]; };
		};
		u32 m_value;	// payload carried alongside the key

		// Orders by key only; m_value does not participate in comparison.
		friend bool operator <(const SortData& a, const SortData& b)
		{
			return a.m_key < b.m_key;
		}
	};
};

View File

@@ -0,0 +1,146 @@
/*
2011 Takahiro Harada
*/
#define PATH "..\\..\\AdlPrimitives\\Sort\\RadixSortAdvancedKernels"
#define KERNEL0 "StreamCountKernel"
#define KERNEL1 "SortAndScatterKernel1"
#define KERNEL2 "PrefixScanKernel"
// Multi-pass GPU radix sort over 32bit keys (4 bits per pass, 8 passes),
// built from three kernels: per-block stream count (KERNEL0), prefix scan of
// the histograms (KERNEL2), and sort-and-scatter (KERNEL1). Kernels are
// loaded from the sources at PATH.
template<DeviceType type>
class RadixSortAdvanced : public RadixSortBase
{
	public:
		typedef Launcher::BufferInfo BufferInfo;

		enum
		{
			WG_SIZE = 128,				// work-group size used by all three kernels
			NUM_PER_WI = 4,				// elements handled per work item
			MAX_NUM_WORKGROUPS = 60,	// cap on simultaneously launched work groups
		};

		struct Data : public RadixSort<type>::Data
		{
			Kernel* m_localCountKernel;			// KERNEL0 "StreamCountKernel"
			Kernel* m_scatterKernel;			// KERNEL1 "SortAndScatterKernel1"
			Kernel* m_scanKernel;				// KERNEL2 "PrefixScanKernel"
			Buffer<u32>* m_workBuffer0;			// per-group histograms (MAX_NUM_WORKGROUPS*16 entries)
			Buffer<SortData>* m_workBuffer1;	// ping-pong buffer for scatter output
			Buffer<int4>* m_constBuffer[32/4];	// one constant buffer per pass (8 passes)
		};

		static
		Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_NORMAL);

		static
		void deallocate(void* data);

		// Sorts inout in place. Requires sortBits == 32 and n to be a
		// multiple of WG_SIZE*NUM_PER_WI (see execute's asserts).
		static
		void execute(void* data, Buffer<SortData>& inout, int n, int sortBits);
};
// Creates the per-instance state: loads the three kernels from PATH and
// allocates the histogram, ping-pong and per-pass constant buffers.
template<DeviceType type>
typename RadixSortAdvanced<type>::Data* RadixSortAdvanced<type>::allocate(const Device* deviceData, int maxSize, Option option)
{
	ADLASSERT( type == deviceData->m_type );

	// No embedded kernel strings here; kernels are always read from disk.
	const char* src[] = { 0, 0, 0 };

	Data* sortData = new Data;
	sortData->m_option = option;
	sortData->m_deviceData = deviceData;
	sortData->m_maxSize = maxSize;

	sortData->m_localCountKernel = deviceData->getKernel( PATH, KERNEL0, 0, src[type] );
	sortData->m_scatterKernel = deviceData->getKernel( PATH, KERNEL1, 0, src[type] );
	sortData->m_scanKernel = deviceData->getKernel( PATH, KERNEL2, 0, src[type] );

	sortData->m_workBuffer0 = new Buffer<u32>( deviceData, MAX_NUM_WORKGROUPS*16 );
	sortData->m_workBuffer1 = new Buffer<SortData>( deviceData, maxSize );

	// One small constant buffer per radix pass (32 bits / 4 bits per pass).
	const int nConstBuffers = sizeof(sortData->m_constBuffer)/sizeof(sortData->m_constBuffer[0]);
	for(int bufIdx=0; bufIdx<nConstBuffers; bufIdx++)
	{
		sortData->m_constBuffer[bufIdx] = new Buffer<int4>( deviceData, 1, BufferBase::BUFFER_CONST );
	}

	return sortData;
}
// Releases all buffers owned by this instance. Kernels obtained through
// Device::getKernel are not released here (presumably device-managed --
// matches the sibling sort implementations, which do the same).
template<DeviceType type>
void RadixSortAdvanced<type>::deallocate(void* rawData)
{
	Data* sortData = (Data*)rawData;

	delete sortData->m_workBuffer0;
	delete sortData->m_workBuffer1;

	const int nConstBuffers = sizeof(sortData->m_constBuffer)/sizeof(sortData->m_constBuffer[0]);
	for(int bufIdx=0; bufIdx<nConstBuffers; bufIdx++)
	{
		delete sortData->m_constBuffer[bufIdx];
	}

	delete sortData;
}
// Sorts |inout| in place by 32bit key.
// Each of the 8 passes (4 bits per pass) runs: stream count -> prefix scan of
// the histograms -> scatter, ping-ponging between |inout| and m_workBuffer1.
// An even number of passes leaves the sorted result back in |inout|.
// Requires: sortBits == 32, n a multiple of WG_SIZE*NUM_PER_WI, n <= maxSize
// passed to allocate().
template<DeviceType type>
void RadixSortAdvanced<type>::execute(void* rawData, Buffer<SortData>& inout, int n, int sortBits)
{
	Data* data = (Data*)rawData;

	ADLASSERT( sortBits == 32 );
	ADLASSERT( NUM_PER_WI == 4 );
	ADLASSERT( n%(WG_SIZE*NUM_PER_WI) == 0 );
	// m_workBuffer1 (the scatter destination) holds only m_maxSize elements;
	// mirror the capacity check done by RadixSortSimple::execute.
	ADLASSERT( n <= data->m_maxSize );
	ADLASSERT( MAX_NUM_WORKGROUPS < 128*8/16 );

	Buffer<SortData>* src = &inout;
	Buffer<SortData>* dst = data->m_workBuffer1;

	const Device* deviceData = data->m_deviceData;

	int nBlocks = n/(NUM_PER_WI*WG_SIZE);
	// Cap the launch at MAX_NUM_WORKGROUPS groups; each group then processes
	// nBlocksPerGroup blocks.
	const int nWorkGroupsToExecute = min2((int)MAX_NUM_WORKGROUPS, nBlocks);
	int nBlocksPerGroup = (nBlocks+nWorkGroupsToExecute-1)/nWorkGroupsToExecute;
	ADLASSERT( nWorkGroupsToExecute <= MAX_NUM_WORKGROUPS );

	int4 constBuffer = make_int4(0, nBlocks, nWorkGroupsToExecute, nBlocksPerGroup);

	int iPass = 0;
	for(int startBit=0; startBit<32; startBit+=4, iPass++)
	{
		constBuffer.x = startBit;

		{	// count: histogram of the current 4bit digit per block
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( data->m_workBuffer0 ) };

			Launcher launcher( deviceData, data->m_localCountKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
			launcher.launch1D( WG_SIZE* nWorkGroupsToExecute, WG_SIZE );
		}

		{	// scan: prefix sum of all histograms in a single work group
			BufferInfo bInfo[] = { BufferInfo( data->m_workBuffer0 ) };

			Launcher launcher( deviceData, data->m_scanKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
			launcher.launch1D( WG_SIZE, WG_SIZE );
		}

		{	// scatter: move elements from src to their scanned destination in dst
			BufferInfo bInfo[] = { BufferInfo( data->m_workBuffer0, true ), BufferInfo( src ), BufferInfo( dst ) };

			Launcher launcher( deviceData, data->m_scatterKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
			launcher.launch1D( WG_SIZE*nWorkGroupsToExecute, WG_SIZE );
		}

		swap2( src, dst );
	}
}
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2

View File

@@ -0,0 +1,149 @@
/*
2011 Takahiro Harada
*/
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Sort\\RadixSortSimpleKernels"
#define KERNEL0 "LocalCountKernel"
#define KERNEL1 "ScatterKernel"
#include <AdlPrimitives/Sort/RadixSortSimpleCL.h>
#include <AdlPrimitives/Sort/RadixSortSimpleDX11.h>
// Simple GPU radix sort over 32bit keys (8 bits per pass, 4 passes) built from
// two kernels -- per-group count (KERNEL0) and scatter (KERNEL1) -- with the
// histogram scan delegated to the PrefixScan primitive.
template<DeviceType type>
class RadixSortSimple : public RadixSortBase
{
	public:
		typedef Launcher::BufferInfo BufferInfo;

		enum
		{
			WG_SIZE = 128,	// work-group size for both kernels
			NUM_PER_WI = 4,	// elements handled per work item
		};

		struct Data : public RadixSort<type>::Data
		{
			Kernel* m_localCountKernel;			// KERNEL0 "LocalCountKernel"
			Kernel* m_scatterKernel;			// KERNEL1 "ScatterKernel"
			Buffer<u32>* m_workBuffer0;			// per-group histograms (256 bins each)
			Buffer<u32>* m_workBuffer1;			// scanned histograms (scatter offsets)
			Buffer<SortData>* m_workBuffer2;	// ping-pong buffer for scatter output
			Buffer<int4>* m_constBuffer[4];		// one constant buffer per pass (4 passes)
		};

		static
		Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_NORMAL);

		static
		void deallocate(void* data);

		// Sorts inout in place. Requires sortBits == 32 and n to be a
		// multiple of WG_SIZE*NUM_PER_WI (see execute's asserts).
		static
		void execute(void* data, Buffer<SortData>& inout, int n, int sortBits);
};
// Creates the per-instance state: loads the two kernels (from embedded
// strings when ADL_LOAD_KERNEL_FROM_STRING is defined, from PATH otherwise),
// sets up the PrefixScan primitive and allocates all work buffers.
template<DeviceType type>
typename RadixSortSimple<type>::Data* RadixSortSimple<type>::allocate(const Device* deviceData, int maxSize, Option option)
{
	ADLASSERT( type == deviceData->m_type );

	const char* src[] =
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
		{radixSortSimpleKernelsCL, radixSortSimpleKernelsDX11};
#else
		{ 0, 0 };
#endif

	u32 maxNumGroups = (maxSize+WG_SIZE*NUM_PER_WI-1)/(WG_SIZE*NUM_PER_WI);

	Data* sortData = new Data;
	sortData->m_option = option;
	sortData->m_deviceData = deviceData;
	sortData->m_maxSize = maxSize;

	sortData->m_localCountKernel = deviceData->getKernel( PATH, KERNEL0, 0, src[type] );
	sortData->m_scatterKernel = deviceData->getKernel( PATH, KERNEL1, 0, src[type] );

	sortData->m_scanData = PrefixScan<type>::allocate( deviceData, maxSize );

	// 256 histogram bins (8 bits per pass) for each work group.
	sortData->m_workBuffer0 = new Buffer<u32>( deviceData, maxNumGroups*256 );
	sortData->m_workBuffer1 = new Buffer<u32>( deviceData, maxNumGroups*256 );
	sortData->m_workBuffer2 = new Buffer<SortData>( deviceData, maxSize );

	// One small constant buffer per radix pass.
	for(int bufIdx=0; bufIdx<4; bufIdx++)
	{
		sortData->m_constBuffer[bufIdx] = new Buffer<int4>( deviceData, 1, BufferBase::BUFFER_CONST );
	}

	return sortData;
}
// Releases all buffers and the PrefixScan state owned by this instance.
// Kernels obtained through Device::getKernel are not released here
// (presumably device-managed -- matches the sibling implementations).
template<DeviceType type>
void RadixSortSimple<type>::deallocate(void* rawData)
{
	Data* sortData = (Data*)rawData;

	delete sortData->m_workBuffer0;
	delete sortData->m_workBuffer1;
	delete sortData->m_workBuffer2;

	for(int bufIdx=0; bufIdx<4; bufIdx++)
	{
		delete sortData->m_constBuffer[bufIdx];
	}

	PrefixScan<type>::deallocate( sortData->m_scanData );

	delete sortData;
}
// Sorts |inout| in place by 32bit key.
// Each of the 4 passes (8 bits per pass) runs: per-group count -> PrefixScan
// of the histograms -> scatter, ping-ponging between |inout| and
// m_workBuffer2. An even number of passes leaves the result back in |inout|.
// Requires: sortBits == 32, n a multiple of WG_SIZE*NUM_PER_WI, n <= maxSize
// passed to allocate().
template<DeviceType type>
void RadixSortSimple<type>::execute(void* rawData, Buffer<SortData>& inout, int n, int sortBits)
{
	Data* data = (Data*)rawData;

	ADLASSERT( sortBits == 32 );
	// Was a magic "n%512"; 512 == WG_SIZE*NUM_PER_WI (one work group's worth
	// of elements), spelled out here for consistency with RadixSortAdvanced.
	ADLASSERT( n%(WG_SIZE*NUM_PER_WI) == 0 );
	ADLASSERT( n <= data->m_maxSize );

	Buffer<SortData>* src = &inout;
	Buffer<SortData>* dst = data->m_workBuffer2;

	const Device* deviceData = data->m_deviceData;

	int numGroups = (n+WG_SIZE*NUM_PER_WI-1)/(WG_SIZE*NUM_PER_WI);

	int4 constBuffer;
	int iPass = 0;
	for(int startBit=0; startBit<32; startBit+=8, iPass++)
	{
		constBuffer.x = startBit;
		constBuffer.y = numGroups;
		constBuffer.z = WG_SIZE;

		{	// count: 256-bin histogram of the current 8bit digit per group
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( data->m_workBuffer0 ) };

			Launcher launcher( deviceData, data->m_localCountKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
			launcher.launch1D( WG_SIZE*numGroups, WG_SIZE );
		}

		// scan: turn the histograms into global scatter offsets
		PrefixScan<type>::execute( data->m_scanData, *data->m_workBuffer0, *data->m_workBuffer1, numGroups*256 );

		{	// scatter: move elements from src to their scanned destination in dst
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( dst ), BufferInfo( data->m_workBuffer1 ) };

			Launcher launcher( deviceData, data->m_scatterKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
			launcher.launch1D( WG_SIZE*numGroups, WG_SIZE );
		}

		swap2( src, dst );
	}
}
#undef PATH
#undef KERNEL0
#undef KERNEL1

View File

@@ -0,0 +1,13 @@
#!/usr/bin/env python
"""Convert a text file (e.g. an OpenCL/HLSL kernel source) into C source that
embeds the file as one big string literal.

Usage: stringify.py <input-file> <variable-name>

The generated C source is written to stdout (see stringify.bat, which
redirects it into the matching *Kernels*.h header).
"""
import sys
import os      # NOTE(review): unused here -- kept in case other tooling relies on it
import shutil  # NOTE(review): unused here -- kept in case other tooling relies on it


def stringify(lines, variable_name):
    """Return the generated C source as a list of output lines.

    Each input line becomes a quoted fragment ending in an explicit "\\n";
    adjacent fragments concatenate into a single literal named
    `variable_name`, terminated by a lone ';'.
    NOTE(review): embedded double quotes and backslashes are NOT escaped;
    the kernel sources are assumed to contain neither.
    """
    out = ['static const char* ' + variable_name + '= \\']
    for line in lines:
        out.append('"' + line.strip('\n') + '\\n"')
    out.append(';')
    return out


def main(argv):
    """Command-line entry point; returns a process exit code."""
    if len(argv) < 3:
        sys.stderr.write('usage: stringify.py <input-file> <variable-name>\n')
        return 1
    fh = open(argv[1])
    try:
        lines = fh.readlines()
    finally:
        fh.close()  # the original script leaked this handle
    for out_line in stringify(lines, argv[2]):
        print(out_line)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

View File

@@ -0,0 +1,22 @@
rem Regenerates the embedded-kernel C headers for the AdlPrimitives library.
rem Each line runs stringify.py to wrap one OpenCL (.cl) or HLSL (.hlsl)
rem kernel source into a C string constant, redirected into the matching
rem *KernelsCL.h / *KernelsDX11.h header. Re-run after editing any kernel.
stringify.py Fill/FillKernels.cl fillKernelsCL >Fill/FillKernelsCL.h
stringify.py Fill/FillKernels.hlsl fillKernelsDX11 >Fill/FillKernelsDX11.h
stringify.py Scan/PrefixScanKernels.cl prefixScanKernelsCL >Scan/PrefixScanKernelsCL.h
stringify.py Scan/PrefixScanKernels.hlsl prefixScanKernelsDX11 >Scan/PrefixScanKernelsDX11.h
stringify.py Search/BoundSearchKernels.cl boundSearchKernelsCL >Search/BoundSearchKernelsCL.h
stringify.py Search/BoundSearchKernels.hlsl boundSearchKernelsDX11 >Search/BoundSearchKernelsDX11.h
stringify.py Sort/RadixSortSimpleKernels.cl radixSortSimpleKernelsCL >Sort/RadixSortSimpleKernelsCL.h
stringify.py Sort/RadixSortSimpleKernels.hlsl radixSortSimpleKernelsDX11 >Sort/RadixSortSimpleKernelsDX11.h
stringify.py Sort/RadixSortStandardKernels.cl radixSortStandardKernelsCL >Sort/RadixSortStandardKernelsCL.h
stringify.py Sort/RadixSort32Kernels.cl radixSort32KernelsCL >Sort/RadixSort32KernelsCL.h
stringify.py Sort/RadixSort32Kernels.hlsl radixSort32KernelsDX11 >Sort/RadixSort32KernelsDX11.h
stringify.py Copy/CopyKernels.cl copyKernelsCL >Copy/CopyKernelsCL.h
stringify.py Copy/CopyKernels.hlsl copyKernelsDX11 >Copy/CopyKernelsDX11.h
stringify.py Sort/RadixSortStandardKernels.hlsl radixSortStandardKernelsDX11 >Sort/RadixSortStandardKernelsDX11.h
stringify.py Sort/RadixSortAdvancedKernels.hlsl radixSortAdvancedKernelsDX11 >Sort/RadixSortAdvancedKernelsDX11.h