Add the GPU rigid body pipeline from https://github.com/erwincoumans/experiments as a Bullet 3.x preview for Bullet 2.80

This commit is contained in:
erwin.coumans
2012-03-05 00:54:32 +00:00
parent 73c4646b40
commit 571af41cf6
257 changed files with 55106 additions and 0 deletions

View File

@@ -0,0 +1,19 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#include <Adl/Adl.h>
//KernelManager* KernelManager::s_kManager = NULL;

View File

@@ -0,0 +1,235 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef ADL_H
#define ADL_H
#pragma warning( disable : 4996 )
#include <Adl/AdlConfig.h>
#include <Adl/AdlError.h>
#include <algorithm>
//NOTE(review): defining min/max as object-like macros is a known hazard --
//it breaks std::min/std::max and <algorithm> users in any TU that includes
//this header after <algorithm>. Kept for compatibility with existing code.
#ifndef max
#define max(a,b) (((a) > (b)) ? (a) : (b))
#endif
#ifndef min
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif
namespace adl
{
//Backend identifier; used throughout to dispatch Buffer/Kernel operations
//to the OpenCL, DirectX11 or host implementation at runtime.
enum DeviceType
{
TYPE_CL = 0,
TYPE_DX11 = 1,
TYPE_HOST,
};
struct Device;
//Common base for all buffer flavors; only carries the BufferType tag that
//selects how the backing storage is created by the backend.
struct BufferBase
{
enum BufferType
{
BUFFER,
// for dx
BUFFER_CONST,
BUFFER_STAGING,
BUFFER_APPEND,
BUFFER_RAW,
BUFFER_W_COUNTER,
BUFFER_INDEX,
BUFFER_VERTEX,
// for cl
BUFFER_ZERO_COPY,
};
};
//Factory helpers for creating/destroying Device instances and for
//synchronizing with a device's outstanding work.
class DeviceUtils
{
public:
//Selection parameters used when allocating a device (GPU vs CPU,
//preferred vendor, device index). Defaults: first AMD GPU.
struct Config
{
enum DeviceType
{
DEVICE_GPU,
DEVICE_CPU,
};
// for CL
enum DeviceVendor
{
VD_AMD,
VD_INTEL,
VD_NV,
};
Config() : m_type(DEVICE_GPU), m_deviceIdx(0), m_vendor(VD_AMD){}
DeviceType m_type;
int m_deviceIdx;
DeviceVendor m_vendor;
};
//Number of devices available for the given backend (1 for TYPE_HOST).
__inline
static
int getNDevices( DeviceType type );
//Create and initialize a device of the given backend; pair with deallocate().
__inline
static Device* allocate( DeviceType type, Config& cfg );
//Release a device previously returned by allocate().
__inline
static void deallocate( Device* deviceData );
//Block until all work queued on the device has finished.
__inline
static void waitForCompletion( const Device* deviceData );
};
//==========================
// DeviceData
//==========================
struct Kernel;
//Abstract compute device. Concrete backends (DeviceCL, DeviceDX11,
//DeviceHost) override the virtuals; m_type drives the SELECT_DEVICEDATA
//dispatch macros in Adl.inl.
struct Device
{
typedef DeviceUtils::Config Config;
Device( DeviceType type ) : m_type( type ), m_memoryUsage(0)
{
}
//Backend-native context handle (e.g. cl_context), 0 when not applicable.
virtual void* getContext() const { return 0; }
virtual void initialize(const Config& cfg){}
virtual void release(){}
virtual void waitForCompletion() const {}
virtual void getDeviceName( char nameOut[128] ) const {}
//Compile or fetch a cached kernel; base implementation asserts (not supported).
virtual Kernel* getKernel(const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL, bool cacheKernel = true ) const { ADLASSERT(0); return 0;}
//Bytes currently allocated through this device (tracked by backends).
virtual unsigned int getUsedMemory() const { return m_memoryUsage; }
DeviceType m_type;
unsigned int m_memoryUsage;
};
//==========================
// Buffer
//==========================
template<typename T>
struct HostBuffer;
// overload each deviceDatas
//Typed device buffer. Storage is owned when created via allocate()/the
//sized constructor (m_allocated == true) and merely referenced when set
//through setRawPtr(). Copies dispatch to the owning device's backend.
template<typename T>
struct Buffer : public BufferBase
{
__inline
Buffer();
__inline
Buffer(const Device* device, int nElems, BufferType type = BUFFER );
__inline
virtual ~Buffer();
//Adopt an externally owned pointer; the buffer will not free it.
__inline
void setRawPtr( const Device* device, T* ptr, int size, BufferType type = BUFFER );
__inline
void allocate(const Device* device, int nElems, BufferType type = BUFFER );
//Host <-> device copies; offsets are in elements, not bytes.
__inline
void write(T* hostSrcPtr, int nElems, int dstOffsetNElems = 0);
__inline
void read(T* hostDstPtr, int nElems, int srcOffsetNElems = 0) const;
//Device <-> device copies, dispatched through this buffer's device.
__inline
void write(Buffer<T>& src, int nElems);
__inline
void read(Buffer<T>& dst, int nElems) const;
// __inline
// Buffer<T>& operator = (const Buffer<T>& buffer);
__inline
int getSize() const { return m_size; }
DeviceType getType() const { ADLASSERT( m_device ); return m_device->m_type; }
const Device* m_device;
int m_size;
T* m_ptr;
// for DX11
void* m_uav;
void* m_srv;
bool m_allocated; // todo. move this to a bit
};
class BufferUtils
{
public:
template<DeviceType TYPE, bool COPY, typename T>
__inline
static
typename Buffer<T>* map(const Device* device, const Buffer<T>* in, int copySize = -1);
template<bool COPY, typename T>
__inline
static
void unmap( Buffer<T>* native, const Buffer<T>* orig, int copySize = -1 );
};
//==========================
// HostBuffer
//==========================
struct DeviceHost;
//Buffer whose storage lives in host memory and is directly indexable.
//Fix: members and types inherited from the dependent base Buffer<T>
//(m_ptr, BufferType, BUFFER) are not visible to unqualified lookup in
//standard C++; qualify them with this-> / BufferBase:: so the template
//compiles under two-phase lookup (gcc/clang), not just MSVC.
template<typename T>
struct HostBuffer : public Buffer<T>
{
__inline
HostBuffer():Buffer<T>(){}
__inline
HostBuffer(const Device* device, int nElems, BufferBase::BufferType type = BufferBase::BUFFER ) : Buffer<T>(device, nElems, type) {}
// HostBuffer(const Device* deviceData, T* rawPtr, int nElems);
//Element access; valid only while the underlying storage is host memory.
__inline
T& operator[](int idx);
__inline
const T& operator[](int idx) const;
__inline
T* begin() { return this->m_ptr; }
//Copy the contents of a (possibly device-side) buffer into this one.
__inline
HostBuffer<T>& operator = (const Buffer<T>& device);
};
};
#include <Adl/AdlKernel.h>
#if defined(ADL_ENABLE_CL)
#include <Adl/CL/AdlCL.inl>
#endif
#if defined(ADL_ENABLE_DX11)
#include <Adl/DX11/AdlDX11.inl>
#endif
#include <Adl/Host/AdlHost.inl>
#include <Adl/AdlKernel.inl>
#include <Adl/Adl.inl>
#include <Adl/AdlStopwatch.h>
#include <Adl/Host/AdlStopwatchHost.inl>
#include <Adl/AdlStopwatch.inl>
#endif

View File

@@ -0,0 +1,344 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
namespace adl
{
//Number of devices available for the given backend. Falls back to 1 for
//TYPE_HOST (and for backends compiled out of this build).
int DeviceUtils::getNDevices( DeviceType type )
{
switch( type )
{
#if defined(ADL_ENABLE_CL)
case TYPE_CL:
return DeviceCL::getNDevices();
#endif
#if defined(ADL_ENABLE_DX11)
case TYPE_DX11:
return DeviceDX11::getNDevices();
#endif
//host "backend" is always present exactly once
default:
return 1;
};
}
//Create and initialize a device for the requested backend.
//Returns NULL when the backend is unknown or compiled out.
//Fix: deviceData was read uninitialized on the default path in release
//builds, where ADLASSERT compiles to a no-op; it is now null-initialized
//and initialize() is only called on a successfully created device.
Device* DeviceUtils::allocate( DeviceType type, Config& cfg )
{
Device* deviceData = 0;
switch( type )
{
#if defined(ADL_ENABLE_CL)
case TYPE_CL:
deviceData = new DeviceCL();
break;
#endif
#if defined(ADL_ENABLE_DX11)
case TYPE_DX11:
deviceData = new DeviceDX11();
break;
#endif
case TYPE_HOST:
deviceData = new DeviceHost();
break;
default:
ADLASSERT( 0 );
break;
};
if( deviceData )
deviceData->initialize( cfg );
return deviceData;
}
//Destroy a device created by allocate(). Asserts that every buffer
//allocated through the device was released first (leak detection).
void DeviceUtils::deallocate( Device* deviceData )
{
ADLASSERT( deviceData->getUsedMemory() == 0 );
deviceData->release();
delete deviceData;
}
//Block the calling thread until all work queued on the device completes.
void DeviceUtils::waitForCompletion( const Device* deviceData )
{
deviceData->waitForCompletion();
}
//Dispatch macros: route a member-function call to the concrete backend
//class based on the runtime DeviceType. One variant is defined per
//combination of the ADL_ENABLE_CL / ADL_ENABLE_DX11 build flags so that
//disabled backends produce no references to their (absent) classes.
//SELECT_DEVICEDATA uses the enclosing object's m_device; SELECT_DEVICEDATA1
//takes the device pointer explicitly.
#if defined(ADL_ENABLE_DX11)
#if defined(ADL_ENABLE_CL)
#define SELECT_DEVICEDATA( type, func ) \
switch( type ) \
{ \
case TYPE_CL: ((DeviceCL*)m_device)->func; break; \
case TYPE_DX11: ((DeviceDX11*)m_device)->func; break; \
case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
default: ADLASSERT(0); break; \
}
#define SELECT_DEVICEDATA1( deviceData, func ) \
switch( deviceData->m_type ) \
{ \
case TYPE_CL: ((DeviceCL*)deviceData)->func; break; \
case TYPE_DX11: ((DeviceDX11*)deviceData)->func; break; \
case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
default: ADLASSERT(0); break; \
}
#else
#define SELECT_DEVICEDATA( type, func ) \
switch( type ) \
{ \
case TYPE_DX11: ((DeviceDX11*)m_device)->func; break; \
case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
default: ADLASSERT(0); break; \
}
#define SELECT_DEVICEDATA1( deviceData, func ) \
switch( deviceData->m_type ) \
{ \
case TYPE_DX11: ((DeviceDX11*)deviceData)->func; break; \
case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
default: ADLASSERT(0); break; \
}
#endif
#else
#if defined(ADL_ENABLE_CL)
#define SELECT_DEVICEDATA( type, func ) \
switch( type ) \
{ \
case TYPE_CL: ((DeviceCL*)m_device)->func; break; \
case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
default: ADLASSERT(0); break; \
}
#define SELECT_DEVICEDATA1( deviceData, func ) \
switch( deviceData->m_type ) \
{ \
case TYPE_CL: ((DeviceCL*)deviceData)->func; break; \
case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
default: ADLASSERT(0); break; \
}
#else
#define SELECT_DEVICEDATA( type, func ) \
switch( type ) \
{ \
case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
default: ADLASSERT(0); break; \
}
#define SELECT_DEVICEDATA1( deviceData, func ) \
switch( deviceData->m_type ) \
{ \
case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
default: ADLASSERT(0); break; \
}
#endif
#endif
//Construct an empty buffer: no device attached, no storage owned.
template<typename T>
Buffer<T>::Buffer()
{
m_device = 0;
m_ptr = 0;
m_size = 0;
m_uav = 0;
m_srv = 0;
m_allocated = false;
}
//Construct and immediately allocate nElems elements on the given device.
//allocate() fills in the remaining members (m_ptr, m_uav, m_srv, ...).
template<typename T>
Buffer<T>::Buffer(const Device* deviceData, int nElems, BufferType type )
{
m_device = 0;
allocate( deviceData, nElems, type );
}
//Release the backing storage, but only if this buffer owns it
//(storage adopted via setRawPtr() is left untouched).
template<typename T>
Buffer<T>::~Buffer()
{
if( m_allocated )
{
if( m_device )
SELECT_DEVICEDATA( m_device->m_type, deallocate( this ) );
}
m_device = 0;
m_ptr = 0;
m_size = 0;
}
//Wrap an externally owned pointer; the buffer will not free it
//(m_allocated stays false). Only plain BUFFER on non-DX11 devices is
//supported so far.
template<typename T>
void Buffer<T>::setRawPtr( const Device* device, T* ptr, int size, BufferType type )
{
ADLASSERT( m_device == 0 );
ADLASSERT( type == BUFFER ); // todo. implement
ADLASSERT( device->m_type != TYPE_DX11 ); // todo. implement set srv, uav
m_device = device;
m_ptr = ptr;
m_size = size;
}
//Allocate nElems elements through the device backend; the backend fills
//m_ptr/m_size (and m_uav/m_srv on DX11). May only be called on a buffer
//that is not yet bound to a device.
template<typename T>
void Buffer<T>::allocate(const Device* deviceData, int nElems, BufferType type )
{
ADLASSERT( m_device == 0 );
m_device = deviceData;
m_size = 0;
m_ptr = 0;
m_uav = 0;
m_srv = 0;
SELECT_DEVICEDATA( m_device->m_type, allocate( this, nElems, type ) );
m_allocated = true;
}
//Copy nElems elements from host memory into this buffer, starting at
//offsetNElems (element offset). Asynchronous on GPU backends; use
//DeviceUtils::waitForCompletion() before reusing hostPtr.
template<typename T>
void Buffer<T>::write(T* hostPtr, int nElems, int offsetNElems)
{
ADLASSERT( nElems+offsetNElems <= m_size );
SELECT_DEVICEDATA( m_device->m_type, copy(this, hostPtr, nElems, offsetNElems) );
}
//Copy nElems elements from this buffer (starting at offsetNElems) into
//host memory. Asynchronous on GPU backends; wait for completion before
//touching hostPtr.
//Fix: added the same bounds assertion write() performs -- reading past
//the end of the device buffer was previously unchecked.
template<typename T>
void Buffer<T>::read(T* hostPtr, int nElems, int offsetNElems) const
{
ADLASSERT( nElems+offsetNElems <= m_size );
SELECT_DEVICEDATA( m_device->m_type, copy(hostPtr,this, nElems, offsetNElems) );
}
//Copy the first nElems elements of src into this buffer (device-side copy
//dispatched through this buffer's backend).
template<typename T>
void Buffer<T>::write(Buffer<T>& src, int nElems)
{
ADLASSERT( nElems <= m_size );
SELECT_DEVICEDATA( m_device->m_type, copy(this, &src, nElems) );
}
//Copy the first nElems elements of this buffer into dst.
//Fix: added the bounds assertion that the mirror-image write(Buffer&,int)
//already performs; the read path was previously unchecked.
template<typename T>
void Buffer<T>::read(Buffer<T>& dst, int nElems) const
{
ADLASSERT( nElems <= m_size );
SELECT_DEVICEDATA( m_device->m_type, copy(&dst, this, nElems) );
}
/*
template<typename T>
Buffer<T>& Buffer<T>::operator = ( const Buffer<T>& buffer )
{
// ADLASSERT( buffer.m_size <= m_size );
SELECT_DEVICEDATA( m_device->m_type, copy(this, &buffer, min2( m_size, buffer.m_size) ) );
return *this;
}
*/
template<DeviceType TYPE, bool COPY, typename T>
__inline
static
typename Buffer<T>* BufferUtils::map(const Device* device, const Buffer<T>* in, int copySize)
{
Buffer<T>* native;
ADLASSERT( device->m_type == TYPE );
if( in->getType() == TYPE )
native = (Buffer<T>*)in;
else
{
ADLASSERT( copySize <= in->getSize() );
copySize = (copySize==-1)? in->getSize() : copySize;
native = new Buffer<T>( device, copySize );
if( COPY )
{
if( in->getType() == TYPE_HOST )
native->write( in->m_ptr, copySize );
else if( native->getType() == TYPE_HOST )
{
in->read( native->m_ptr, copySize );
DeviceUtils::waitForCompletion( in->m_device );
}
else
{
T* tmp = new T[copySize];
in->read( tmp, copySize );
DeviceUtils::waitForCompletion( in->m_device );
native->write( tmp, copySize );
DeviceUtils::waitForCompletion( native->m_device );
delete [] tmp;
}
}
}
return native;
}
//Release a buffer obtained from BufferUtils::map(). When map() had to
//create a temporary (native != orig), the data is written back to orig
//(if COPY) and the temporary is deleted. Note the const_cast-style cast on
//orig: unmap deliberately writes through the "const" original buffer.
//Fix: removed the repeated 'static', which is illegal on an out-of-class
//member definition in standard C++ (MSVC-only extension).
template<bool COPY, typename T>
__inline
void BufferUtils::unmap( Buffer<T>* native, const Buffer<T>* orig, int copySize )
{
if( native != orig )
{
if( COPY )
{
copySize = (copySize==-1)? orig->getSize() : copySize;
ADLASSERT( copySize <= orig->getSize() );
if( orig->getType() == TYPE_HOST )
{
native->read( orig->m_ptr, copySize );
DeviceUtils::waitForCompletion( native->m_device );
}
else if( native->getType() == TYPE_HOST )
{
Buffer<T>* dst = (Buffer<T>*)orig;
dst->write( native->m_ptr, copySize );
DeviceUtils::waitForCompletion( dst->m_device );
}
else
{
//device-to-device across backends: stage through the host
T* tmp = new T[copySize];
native->read( tmp, copySize );
DeviceUtils::waitForCompletion( native->m_device );
Buffer<T>* dst = (Buffer<T>*)orig;
dst->write( tmp, copySize );
DeviceUtils::waitForCompletion( dst->m_device );
delete [] tmp;
}
}
delete native;
}
}
//Element access into the host-side storage.
//Fix: m_ptr lives in the dependent base Buffer<T>; unqualified lookup does
//not search dependent bases in standard C++, so qualify with this->.
template<typename T>
T& HostBuffer<T>::operator[](int idx)
{
return this->m_ptr[idx];
}
//Const element access into the host-side storage.
//Fix: this-> is required for m_ptr from the dependent base Buffer<T>.
template<typename T>
const T& HostBuffer<T>::operator[](int idx) const
{
return this->m_ptr[idx];
}
//Copy the contents of a (possibly device-side) buffer into this host
//buffer via the source device's backend.
//Fix: m_size/m_ptr come from the dependent base Buffer<T> and must be
//qualified with this-> under two-phase name lookup.
template<typename T>
HostBuffer<T>& HostBuffer<T>::operator = ( const Buffer<T>& device )
{
ADLASSERT( device.m_size <= this->m_size );
SELECT_DEVICEDATA1( device.m_device, copy( this->m_ptr, &device, device.m_size ) );
return *this;
}
#undef SELECT_DEVICEDATA
};

View File

@@ -0,0 +1,27 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//ADL_ENABLE_CL and ADL_ENABLE_DX11 can be set in the build system using C/C++ preprocessor defines
//#define ADL_ENABLE_CL
//#define ADL_ENABLE_DX11
//#define ADL_CL_FORCE_UNCACHE_KERNEL
#define ADL_CL_DUMP_MEMORY_LOG
//load the kernels from string instead of loading them from file
#define ADL_LOAD_KERNEL_FROM_STRING
#define ADL_DUMP_DX11_ERROR

View File

@@ -0,0 +1,80 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef ADL_ERROR_H
#define ADL_ERROR_H
#if defined(ADL_DUMP_DX11_ERROR)
#include <windows.h>
#endif
#ifdef _DEBUG
#include <assert.h>
#include <stdarg.h>
#include <stdio.h>
#endif
namespace adl
{
//Debug assertion: breaks into the debugger on failure. The release variant
//still evaluates x (so side effects are preserved) but does nothing.
#ifdef _DEBUG
#define ADLASSERT(x) if(!(x)){__debugbreak(); }
#else
#define ADLASSERT(x) if(x){}
#endif
//Compile-time check: declares an array whose size is x, so x == 0 fails to
//compile in debug builds. No-op in release.
#ifdef _DEBUG
#define COMPILE_TIME_ASSERT(x) {int compileTimeAssertFailed[x]; compileTimeAssertFailed[0];}
#else
#define COMPILE_TIME_ASSERT(x)
#endif
//printf-style debug output. In debug builds the message goes to the
//Visual Studio output window when ADL_DUMP_DX11_ERROR is defined (with a
//char->wide conversion under UNICODE), otherwise to stdout. Release
//builds compile to a no-op.
//NOTE(review): messages longer than 10KB are truncated by vsprintf_s;
//presumably acceptable for debug logging.
#ifdef _DEBUG
__inline
void debugPrintf(const char *fmt, ...)
{
va_list arg;
va_start(arg, fmt);
#if defined(ADL_DUMP_DX11_ERROR)
const int size = 1024*10;
char buf[size];
vsprintf_s( buf, size, fmt, arg );
#ifdef UNICODE
WCHAR wbuf[size];
//first call (cchWideChar == 0) only queries the required length
int sizeWide = MultiByteToWideChar(0,0,buf,-1,wbuf,0);
MultiByteToWideChar(0,0,buf,-1,wbuf,sizeWide);
// swprintf_s( wbuf, 256, L"%s", buf );
OutputDebugString( wbuf );
#else
OutputDebugString( buf );
#endif
#else
vprintf(fmt, arg);
#endif
va_end(arg);
}
#else
//release build: formatting arguments are evaluated at the call site but
//nothing is printed
__inline
void debugPrintf(const char *fmt, ...)
{
}
#endif
};
#endif

View File

@@ -0,0 +1,142 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef ADL_KERNEL_H
#define ADL_KERNEL_H
#include <map>
#include <string>
#include <fstream>
namespace adl
{
//==========================
// Kernel
//==========================
//A compiled kernel: the backend tag plus an opaque backend-specific handle
//(e.g. cl_kernel for TYPE_CL).
struct Kernel
{
DeviceType m_type;
void* m_kernel;
};
//==========================
// KernelManager
//==========================
//Cache of compiled kernels, keyed by a string built from the device
//context, file name, function name and build options. Owns the Kernel
//objects and releases them in the destructor.
class KernelManager
{
public:
typedef std::map<std::string, Kernel*> KMap;
__inline
~KernelManager();
//Return the cached kernel, compiling it on first use.
__inline
// static
Kernel* query(const Device* dd, const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL,
bool cacheKernel = true);
public:
KMap m_map;
};
//==========================
// Launcher
//==========================
//Convenience wrapper to bind buffers/constants to a kernel and launch it.
//m_idx / m_idxRw track the next free argument slots for the backends.
class Launcher
{
public:
//A kernel argument: type-erased buffer pointer plus read-only flag.
struct BufferInfo
{
//NOTE(review): the default constructor leaves both members
//uninitialized; callers are expected to fill them before use.
BufferInfo(){}
template<typename T>
BufferInfo(Buffer<T>* buff, bool isReadOnly = false): m_buffer(buff), m_isReadOnly(isReadOnly){}
void* m_buffer;
bool m_isReadOnly;
};
//Compile (or fetch) the kernel by name, then bind arguments and launch.
__inline
Launcher(const Device* dd, char* fileName, char* funcName, char* option = NULL);
__inline
Launcher(const Device* dd, Kernel* kernel);
__inline
void setBuffers( BufferInfo* buffInfo, int n );
//Upload 'consts' into constBuff and bind it as the constant buffer.
template<typename T>
__inline
void setConst( Buffer<T>& constBuff, const T& consts );
__inline
void launch1D( int numThreads, int localSize = 64 );
__inline
void launch2D( int numThreadsX, int numThreadsY, int localSizeX = 8, int localSizeY = 8 );
public:
enum
{
CONST_BUFFER_SIZE = 512,
};
const Device* m_deviceData;
Kernel* m_kernel;
int m_idx;
int m_idxRw;
};
//Backend-specific kernel compiler (specialized per DeviceType in the
//CL/DX11 .inl files). Builds from a file or an in-memory source string,
//then creates one or more Kernel objects from the compiled program.
template<DeviceType TYPE>
class KernelBuilder
{
public:
__inline
KernelBuilder(): m_ptr(0){}
__inline
void setFromFile( const Device* deviceData, const char* fileName, const char* option = NULL, bool addExtension = false,
bool cacheKernel = true);
__inline
void setFromSrc( const Device* deviceData, const char* src, const char* option = NULL );
//Like setFromSrc but uses fileName as the on-disk cache key.
__inline
void setFromSrcCached( const Device* deviceData, const char* src, const char* fileName, const char* option );
__inline
void createKernel( const char* funcName, Kernel& kernelOut );
__inline
~KernelBuilder();
// todo. implemement in kernel destructor?
__inline
static void deleteKernel( Kernel& kernel );
private:
enum
{
MAX_PATH_LENGTH = 260,
};
const Device* m_deviceData;
#ifdef UNICODE
wchar_t m_path[MAX_PATH_LENGTH];
#else
char m_path[MAX_PATH_LENGTH];
#endif
void* m_ptr;
};
};
#endif //ADL_KERNEL_H

View File

@@ -0,0 +1,223 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifdef ADL_ENABLE_CL
#include <Adl/CL/AdlKernelUtilsCL.inl>
#endif
#ifdef ADL_ENABLE_DX11
#include <Adl/DX11/AdlKernelUtilsDX11.inl>
#endif
namespace adl
{
//==========================
// KernelManager
//==========================
//Return the kernel funcName from fileName, compiling it on first use and
//caching it in m_map. The cache key combines device context, file name,
//function name and build options.
//NOTE(review): (int)dd->getContext() truncates the pointer on 64-bit
//builds, which can make cache keys collide across contexts -- consider
//using %p / size_t instead.
//NOTE(review): the 'case TYPE_CL:' label below sits OUTSIDE its
//#if defined(ADL_ENABLE_CL) guard; with CL disabled a TYPE_CL device
//falls through to the next enabled case instead of the default assert.
Kernel* KernelManager::query(const Device* dd, const char* fileName, const char* funcName, const char* option, const char* src,
bool cacheKernel)
{
printf("compiling kernel %s",funcName);
const int charSize = 1024*2;
KernelManager* s_kManager = this;
char fullFineName[charSize];
//derive the full file name (with backend-specific extension)
switch( dd->m_type )
{
case TYPE_CL:
#if defined(ADL_ENABLE_CL)
sprintf_s(fullFineName,charSize,"%s.cl", fileName);
break;
#endif
#if defined(ADL_ENABLE_DX11)
case TYPE_DX11:
sprintf_s(fullFineName,charSize,"%s.hlsl", fileName);
break;
#endif
default:
ADLASSERT(0);
break;
};
//build the cache key
char mapName[charSize];
{
if( option )
sprintf_s(mapName, charSize, "%d%s%s%s", (int)dd->getContext(), fullFineName, funcName, option);
else
sprintf_s(mapName, charSize, "%d%s%s", (int)dd->getContext(), fullFineName, funcName);
}
std::string str(mapName);
KMap::iterator iter = s_kManager->m_map.find( str );
Kernel* kernelOut;
if( iter == s_kManager->m_map.end() )
{
//cache miss: compile via the backend-specific KernelBuilder
kernelOut = new Kernel();
switch( dd->m_type )
{
#if defined(ADL_ENABLE_CL)
case TYPE_CL:
{
KernelBuilder<TYPE_CL> builder;
if( src )
if (cacheKernel)
{
builder.setFromSrcCached( dd, src, fileName, option );
} else
{
builder.setFromSrc( dd, src, option );
}
else
builder.setFromFile( dd, fileName, option, true, cacheKernel );
builder.createKernel( funcName, *kernelOut );
}
break;
#endif
#if defined(ADL_ENABLE_DX11)
case TYPE_DX11:
{
KernelBuilder<TYPE_DX11> builder;
if( src )
builder.setFromSrc( dd, src, option );
else
builder.setFromFile( dd, fileName, option, true, cacheKernel );
builder.createKernel( funcName, *kernelOut );
}
break;
#endif
default:
ADLASSERT(0);
break;
};
s_kManager->m_map.insert( KMap::value_type(str,kernelOut) );
}
else
{
kernelOut = iter->second;
}
printf(" ready\n");
return kernelOut;
}
//Release every cached kernel through the backend that compiled it.
KernelManager::~KernelManager()
{
for(KMap::iterator iter = m_map.begin(); iter != m_map.end(); iter++)
{
Kernel* k = iter->second;
switch( k->m_type )
{
#if defined(ADL_ENABLE_CL)
case TYPE_CL:
KernelBuilder<TYPE_CL>::deleteKernel( *k );
delete k;
break;
#endif
#if defined(ADL_ENABLE_DX11)
case TYPE_DX11:
KernelBuilder<TYPE_DX11>::deleteKernel( *k );
delete k;
break;
#endif
default:
ADLASSERT(0);
break;
};
}
}
//==========================
// Launcher
//==========================
//Dispatch macro for Launcher: routes a static call to LauncherCL or
//LauncherDX11 based on the runtime DeviceType. One variant per
//combination of the ADL_ENABLE_CL / ADL_ENABLE_DX11 build flags; note
//there is no TYPE_HOST launcher, so host devices always hit the assert.
#if defined(ADL_ENABLE_DX11)
#if defined(ADL_ENABLE_CL)
#define SELECT_LAUNCHER( type, func ) \
switch( type ) \
{ \
case TYPE_CL: LauncherCL::func; break; \
case TYPE_DX11: LauncherDX11::func; break; \
default: ADLASSERT(0); break; \
};
#else
#define SELECT_LAUNCHER( type, func ) \
switch( type ) \
{ \
case TYPE_DX11: LauncherDX11::func; break; \
default: ADLASSERT(0); break; \
};
#endif
#else
#if defined(ADL_ENABLE_CL)
#define SELECT_LAUNCHER( type, func ) \
switch( type ) \
{ \
case TYPE_CL: LauncherCL::func; break; \
default: ADLASSERT(0); break; \
};
#else
#define SELECT_LAUNCHER( type, func ) \
switch( type ) \
{ \
default: ADLASSERT(0); break; \
};
#endif
#endif
//Set up a launcher for the kernel funcName in fileName, compiling it (or
//fetching it from the device's kernel cache) on the spot.
Launcher::Launcher(const Device *dd, char *fileName, char *funcName, char *option)
{
m_deviceData = dd;
m_idx = 0;
m_idxRw = 0;
m_kernel = dd->getKernel( fileName, funcName, option );
}
//Set up a launcher around an already-compiled kernel.
Launcher::Launcher(const Device* dd, Kernel* kernel)
{
m_deviceData = dd;
m_idx = 0;
m_idxRw = 0;
m_kernel = kernel;
}
//Bind n buffer arguments to the kernel, in order.
void Launcher::setBuffers( BufferInfo* buffInfo, int n )
{
SELECT_LAUNCHER( m_deviceData->m_type, setBuffers( this, buffInfo, n ) );
}
//Upload 'consts' into constBuff and bind it as the kernel's constant buffer.
template<typename T>
void Launcher::setConst( Buffer<T>& constBuff, const T& consts )
{
SELECT_LAUNCHER( m_deviceData->m_type, setConst( this, constBuff, consts ) );
}
//Launch as a degenerate 2D grid with a single row of work-groups.
void Launcher::launch1D( int numThreads, int localSize )
{
SELECT_LAUNCHER( m_deviceData->m_type, launch2D( this, numThreads, 1, localSize, 1 ) );
}
//Launch a 2D grid of numThreadsX x numThreadsY work items with the given
//work-group (local) dimensions.
void Launcher::launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
SELECT_LAUNCHER( m_deviceData->m_type, launch2D( this, numThreadsX, numThreadsY, localSizeX, localSizeY ) );
}
#undef SELECT_LAUNCHER
};

View File

@@ -0,0 +1,81 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#include <windows.h>
namespace adl
{
//Abstract timer interface; concrete implementations record up to CAPACITY
//split timestamps and report elapsed milliseconds between them.
//NOTE(review): the Device* constructor calls the pure virtual init() from
//the base-class constructor, which is undefined behavior in C++ (the
//derived override is not yet constructed) -- avoid that constructor.
struct StopwatchBase
{
__inline
StopwatchBase(): m_device(0){}
__inline
StopwatchBase( const Device* deviceData ){ init(deviceData); }
__inline
virtual ~StopwatchBase(){}
__inline
virtual void init( const Device* deviceData ) = 0;
__inline
virtual void start() = 0;
__inline
virtual void split() = 0;
__inline
virtual void stop() = 0;
//Milliseconds of the index-th recorded interval.
__inline
virtual float getMs(int index=0) = 0;
__inline
virtual void getMs( float* times, int capacity ) = 0;
//Number of intervals recorded so far (m_idx timestamps => m_idx-1 intervals).
__inline
int getNIntervals() const{ return m_idx-1;}
enum
{
CAPACITY = 64,
};
const Device* m_device;
int m_idx;
};
//Value-type facade over a StopwatchBase implementation, chosen lazily by
//init() (see AdlStopwatch.inl). start() self-initializes with a host
//stopwatch when init() was never called.
struct Stopwatch
{
__inline
Stopwatch( const Device* deviceData = NULL ) { m_impl=0; if(deviceData) init(deviceData);}
__inline
~Stopwatch();
__inline
void init( const Device* deviceData );
__inline
void start(){if(!m_impl) init(0); m_impl->start();}
__inline
void split(){m_impl->split();}
__inline
void stop(){m_impl->stop();}
__inline
float getMs(){ return m_impl->getMs();}
__inline
void getMs( float* times, int capacity ){m_impl->getMs(times, capacity);}
__inline
int getNIntervals() const{return m_impl->getNIntervals();}
StopwatchBase* m_impl;
};
};

View File

@@ -0,0 +1,59 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
namespace adl
{
//Choose and create the stopwatch implementation for the given device.
//Currently every backend uses the host stopwatch (the CL/DX11 GPU timer
//implementations are stubbed out -- see the comments on each case).
//May only be called once per Stopwatch.
void Stopwatch::init( const Device* deviceData )
{
ADLASSERT( m_impl == 0 );
if( deviceData )
{
switch( deviceData->m_type )
{
#if defined(ADL_ENABLE_CL)
case TYPE_CL:
m_impl = new StopwatchHost;//StopwatchCL
break;
#endif
#if defined(ADL_ENABLE_DX11)
case TYPE_DX11:
m_impl = new StopwatchHost;//StopwatchDX11;
break;
#endif
case TYPE_HOST:
m_impl = new StopwatchHost;
break;
default:
ADLASSERT(0);
break;
};
}
else
{
//no device given (e.g. lazy init from start()): time on the host
m_impl = new StopwatchHost;
}
m_impl->init( deviceData );
}
//Release the implementation, if one was ever created.
//(delete on a null pointer is a well-defined no-op, so no guard is needed.)
Stopwatch::~Stopwatch()
{
delete m_impl;
}
};

View File

@@ -0,0 +1,384 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma comment(lib,"OpenCL.lib")
#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <CL/cl_platform.h>
namespace adl
{
//OpenCL backend: wraps a cl_context / cl_command_queue pair plus a kernel
//cache, and implements the Buffer allocate/deallocate/copy primitives
//dispatched by the SELECT_DEVICEDATA macros.
struct DeviceCL : public Device
{
typedef DeviceUtils::Config Config;
__inline
DeviceCL() : Device( TYPE_CL ), m_kernelManager(0){}
__inline
void* getContext() const { return m_context; }
__inline
void initialize(const Config& cfg);
__inline
void release();
//Create/destroy device memory for a Buffer<T>.
template<typename T>
__inline
void allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type);
template<typename T>
__inline
void deallocate(Buffer<T>* buf);
//Copies; all sizes and offsets are in elements, not bytes.
template<typename T>
__inline
void copy(Buffer<T>* dst, const Buffer<T>* src, int nElems,int srcOffsetNElems = 0,int dstOffsetNElems = 0);
template<typename T>
__inline
void copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems = 0);
template<typename T>
__inline
void copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems = 0);
__inline
void waitForCompletion() const;
__inline
void getDeviceName( char nameOut[128] ) const;
__inline
static
int getNDevices();
__inline
Kernel* getKernel(const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL, bool cacheKernel = true )const;
enum
{
MAX_NUM_DEVICES = 6,
};
cl_context m_context;
cl_command_queue m_commandQueue;
cl_device_id m_deviceIdx;
KernelManager* m_kernelManager;
};
//===
//===
void DeviceCL::initialize(const Config& cfg)
{
// DeviceUtils::create( cfg, (DeviceCL*)this );
{
// dd = new DeviceCL();
DeviceCL* deviceData = (DeviceCL*)this;
// cl_device_type deviceType = (driverType == DRIVER_HARDWARE)? CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU;
cl_device_type deviceType = (cfg.m_type== Config::DEVICE_GPU)? CL_DEVICE_TYPE_GPU: CL_DEVICE_TYPE_CPU;
// int numContextQueuePairsToCreate = 1;
bool enableProfiling = false;
#ifdef _DEBUG
enableProfiling = true;
#endif
cl_int status;
cl_platform_id platform;
{
cl_uint nPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &nPlatforms);
ADLASSERT( status == CL_SUCCESS );
cl_platform_id pIdx[5];
status = clGetPlatformIDs(nPlatforms, pIdx, NULL);
ADLASSERT( status == CL_SUCCESS );
cl_uint atiIdx = -1;
cl_uint intelIdx = -1;
cl_uint nvIdx = -1;
for(cl_uint i=0; i<nPlatforms; i++)
{
char buff[512];
status = clGetPlatformInfo( pIdx[i], CL_PLATFORM_VENDOR, 512, buff, 0 );
ADLASSERT( status == CL_SUCCESS );
//skip the platform if there are no devices available
cl_uint numDevice;
status = clGetDeviceIDs( pIdx[i], deviceType, 0, NULL, &numDevice );
if (numDevice>0)
{
if( strcmp( buff, "NVIDIA Corporation" )==0 ) nvIdx = i;
if( strcmp( buff, "Advanced Micro Devices, Inc." )==0 ) atiIdx = i;
if( strcmp( buff, "Intel(R) Corporation" )==0 ) intelIdx = i;
}
}
if( deviceType == CL_DEVICE_TYPE_GPU )
{
switch( cfg.m_vendor )
{
case DeviceUtils::Config::VD_AMD:
if( atiIdx == -1 && nvIdx != -1 ) goto USE_NV_GPU;
USE_AMD_GPU:
ADLASSERT(atiIdx != -1 );
platform = pIdx[atiIdx];
break;
case DeviceUtils::Config::VD_NV:
if( atiIdx != -1 && nvIdx == -1 ) goto USE_AMD_GPU;
USE_NV_GPU:
ADLASSERT(nvIdx != -1 );
platform = pIdx[nvIdx];
break;
default:
ADLASSERT(0);
break;
};
}
else if( deviceType == CL_DEVICE_TYPE_CPU )
{
switch( cfg.m_vendor )
{
case DeviceUtils::Config::VD_AMD:
ADLASSERT(atiIdx != -1 );
platform = pIdx[atiIdx];
break;
case DeviceUtils::Config::VD_INTEL:
ADLASSERT(intelIdx != -1 );
platform = pIdx[intelIdx];
break;
default:
ADLASSERT(0);
break;
};
}
}
cl_uint numDevice;
status = clGetDeviceIDs( platform, deviceType, 0, NULL, &numDevice );
// ADLASSERT( cfg.m_deviceIdx < (int)numDevice );
debugPrintf("CL: %d %s Devices ", numDevice, (deviceType==CL_DEVICE_TYPE_GPU)? "GPU":"CPU");
// numContextQueuePairsToCreate = min( (int)numDevice, numContextQueuePairsToCreate );
// numContextQueuePairsToCreate = ( (int)numDevice < numContextQueuePairsToCreate )? numDevice : numContextQueuePairsToCreate;
cl_device_id deviceIds[ MAX_NUM_DEVICES ];
status = clGetDeviceIDs( platform, deviceType, numDevice, deviceIds, NULL );
ADLASSERT( status == CL_SUCCESS );
{ int i = min( (int)numDevice-1, cfg.m_deviceIdx );
m_deviceIdx = deviceIds[i];
deviceData->m_context = clCreateContext( NULL, 1, &deviceData->m_deviceIdx, NULL, NULL, &status );
ADLASSERT( status == CL_SUCCESS );
char buff[512];
status = clGetDeviceInfo( deviceData->m_deviceIdx, CL_DEVICE_NAME, sizeof(buff), &buff, NULL );
ADLASSERT( status == CL_SUCCESS );
debugPrintf("[%s]\n", buff);
deviceData->m_commandQueue = clCreateCommandQueue( deviceData->m_context, deviceData->m_deviceIdx, (enableProfiling)?CL_QUEUE_PROFILING_ENABLE:NULL, NULL );
ADLASSERT( status == CL_SUCCESS );
// status = clSetCommandQueueProperty( commandQueue, CL_QUEUE_PROFILING_ENABLE, CL_TRUE, 0 );
// CLASSERT( status == CL_SUCCESS );
if(0)
{
cl_bool image_support;
clGetDeviceInfo(deviceData->m_deviceIdx, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL);
debugPrintf(" CL_DEVICE_IMAGE_SUPPORT : %s\n", image_support?"Yes":"No");
}
}
}
m_kernelManager = new KernelManager;
}
/// Releases the command queue, the context and the kernel cache.
/// m_kernelManager is reset to 0 so a repeated release (or a later
/// access) does not touch a dangling pointer.
void DeviceCL::release()
{
	clReleaseCommandQueue( m_commandQueue );
	clReleaseContext( m_context );
	if( m_kernelManager )
	{
		delete m_kernelManager;
		m_kernelManager = 0; // was left dangling after delete
	}
}
/// Allocates a cl_mem of nElems elements of T for 'buf' on this device.
/// BUFFER_CONST performs no CL allocation (constants are passed by value).
template<typename T>
void DeviceCL::allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type)
{
	// Bind the buffer to this device and record its element count.
	buf->m_device = this;
	buf->m_size = nElems;
	buf->m_ptr = 0;

	if( type == BufferBase::BUFFER_CONST )
		return;

#if defined(ADL_CL_DUMP_MEMORY_LOG)
	char deviceName[256];
	getDeviceName( deviceName );
	printf( "adlCLMemoryLog %s : %3.2fMB Allocation: %3.2fKB ", deviceName, m_memoryUsage/1024.f/1024.f, sizeof(T)*nElems/1024.f );
	fflush( stdout );
#endif

	// Pick the CL memory flags for the requested buffer flavor.
	cl_mem_flags flags;
	if( type == BufferBase::BUFFER_ZERO_COPY )
		flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
	else if( type == BufferBase::BUFFER_RAW )
		flags = CL_MEM_WRITE_ONLY;
	else
		flags = CL_MEM_READ_WRITE;

	const int byteCount = sizeof(T)*nElems;
	cl_int status = 0;
	buf->m_ptr = (T*)clCreateBuffer( m_context, flags, byteCount, 0, &status );

	m_memoryUsage += buf->m_size*sizeof(T);
#if defined(ADL_CL_DUMP_MEMORY_LOG)
	printf( "%s\n", (status==CL_SUCCESS)? "Succeed": "Failed" );
	fflush( stdout );
#endif
	ADLASSERT( status == CL_SUCCESS );
}
/// Releases the CL allocation behind 'buf' (if any), returns its bytes
/// to the usage tally, and detaches the buffer from this device.
template<typename T>
void DeviceCL::deallocate(Buffer<T>* buf)
{
	T* ptr = buf->m_ptr;
	if( ptr )
	{
		m_memoryUsage -= buf->m_size*sizeof(T);
		clReleaseMemObject( (cl_mem)ptr );
	}
	buf->m_ptr = 0;
	buf->m_size = 0;
	buf->m_device = 0;
}
/// Copies nElems elements between buffers. CL-to-CL copies are enqueued on
/// this device's queue (asynchronous); host<->device pairs route through
/// the buffer's write/read helpers (offsets are ignored on those paths,
/// matching the original behavior).
template<typename T>
void DeviceCL::copy(Buffer<T>* dst, const Buffer<T>* src, int nElems,int srcOffsetNElems,int dstOffsetNElems )
{
	if( dst->m_device->m_type == TYPE_CL && src->m_device->m_type == TYPE_CL )
	{
		cl_int err = clEnqueueCopyBuffer( m_commandQueue,
			(cl_mem)src->m_ptr, (cl_mem)dst->m_ptr,
			sizeof(T)*srcOffsetNElems, sizeof(T)*dstOffsetNElems, sizeof(T)*nElems,
			0, 0, 0 );
		ADLASSERT( err == CL_SUCCESS );
		return;
	}
	if( src->m_device->m_type == TYPE_HOST )
	{
		// Host -> device upload.
		ADLASSERT( dst->getType() == TYPE_CL );
		dst->write( src->m_ptr, nElems );
		return;
	}
	if( dst->m_device->m_type == TYPE_HOST )
	{
		// Device -> host download.
		ADLASSERT( src->getType() == TYPE_CL );
		src->read( dst->m_ptr, nElems );
		return;
	}
	// Unsupported device pairing.
	ADLASSERT( 0 );
}
/// Reads nElems elements (starting at srcOffsetNElems) from a device buffer
/// into host memory. The transfer is issued as blocking (CL_TRUE): the
/// original non-blocking call returned with no event to wait on, so the
/// caller could read 'dst' before the DMA finished.
template<typename T>
void DeviceCL::copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems )
{
	cl_int status = clEnqueueReadBuffer( m_commandQueue, (cl_mem)src->m_ptr, CL_TRUE,
		sizeof(T)*srcOffsetNElems, sizeof(T)*nElems,
		dst, 0,0,0 );
	ADLASSERT( status == CL_SUCCESS );
}
/// Writes nElems elements from host memory into a device buffer at
/// dstOffsetNElems. The transfer is issued as blocking (CL_TRUE): the
/// original non-blocking call could still be reading 'src' after this
/// function returned, racing with the caller reusing or freeing it.
template<typename T>
void DeviceCL::copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems )
{
	int sz=sizeof(T)*nElems;
	cl_int status = clEnqueueWriteBuffer( m_commandQueue, (cl_mem)dst->m_ptr, CL_TRUE,
		sizeof(T)*dstOffsetNElems, sz,
		src, 0,0,0 );
	ADLASSERT( status == CL_SUCCESS );
}
/// Blocks the calling thread until every command previously queued on this
/// device's command queue has finished executing.
void DeviceCL::waitForCompletion() const
{
	clFinish( m_commandQueue );
}
int DeviceCL::getNDevices()
{
cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
cl_int status;
cl_platform_id platform;
{
cl_uint nPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &nPlatforms);
ADLASSERT( status == CL_SUCCESS );
cl_platform_id pIdx[5];
status = clGetPlatformIDs(nPlatforms, pIdx, NULL);
ADLASSERT( status == CL_SUCCESS );
cl_uint nvIdx = -1;
cl_uint atiIdx = -1;
for(cl_uint i=0; i<nPlatforms; i++)
{
char buff[512];
status = clGetPlatformInfo( pIdx[i], CL_PLATFORM_VENDOR, 512, buff, 0 );
ADLASSERT( status == CL_SUCCESS );
if( strcmp( buff, "NVIDIA Corporation" )==0 ) nvIdx = i;
if( strcmp( buff, "Advanced Micro Devices, Inc." )==0 ) atiIdx = i;
}
if( deviceType == CL_DEVICE_TYPE_GPU )
{
if( nvIdx != -1 ) platform = pIdx[nvIdx];
else platform = pIdx[atiIdx];
}
else if( deviceType == CL_DEVICE_TYPE_CPU )
{
platform = pIdx[atiIdx];
}
}
cl_uint numDevice;
status = clGetDeviceIDs( platform, deviceType, 0, NULL, &numDevice );
ADLASSERT( status == CL_SUCCESS );
return numDevice;
}
void DeviceCL::getDeviceName( char nameOut[128] ) const
{
cl_int status;
status = clGetDeviceInfo( m_deviceIdx, CL_DEVICE_NAME, sizeof(char)*128, nameOut, NULL );
ADLASSERT( status == CL_SUCCESS );
}
/// Looks up (and, on a cache miss, compiles and caches) the kernel
/// 'funcName' from 'fileName' via this device's KernelManager.
Kernel* DeviceCL::getKernel(const char* fileName, const char* funcName, const char* option, const char* src, bool cacheKernel )const
{
	return m_kernelManager->query( this, fileName, funcName, option, src, cacheKernel );
}
};

View File

@@ -0,0 +1,541 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
namespace adl
{
/// OpenCL view of the generic Kernel handle: reinterprets the stored
/// opaque member as a cl_kernel.
struct KernelCL : public Kernel
{
	cl_kernel& getKernel() { return (cl_kernel&)m_kernel; }
};
// Returns the tail of 'name' that follows the last occurrence of
// 'pattern'; returns 'name' unchanged when the pattern never occurs.
// (Used to drop directory prefixes from kernel file paths.)
static const char* strip(const char* name, const char* pattern)
{
	const size_t patternLen = strlen(pattern);
	const char* tail = name;
	for(;;)
	{
		const char* hit = strstr(tail, pattern);
		if( !hit )
			break;
		tail = hit + patternLen;
	}
	return tail;
}
/// Returns true when the cached kernel binary 'binaryFileName' exists and its
/// last-write time is at least as recent as the kernel source 'srcFileName',
/// i.e. the cached binary may be reused without recompiling.
/// Windows-only: compares FILETIMEs obtained via CreateFile/GetFileTime.
static bool isFileUpToDate(const char* binaryFileName,const char* srcFileName)
{
	bool fileUpToDate = false;
	bool binaryFileValid=false;
	FILETIME modtimeBinary;

	// Open the cached binary only to read its timestamp.
	int nameLength = (int)strlen(binaryFileName)+1;
#ifdef UNICODE
	// Unicode builds must widen the narrow path before calling CreateFile.
	WCHAR* fName = new WCHAR[nameLength];
	MultiByteToWideChar(CP_ACP,0,binaryFileName,-1, fName, nameLength);
	HANDLE binaryFileHandle = CreateFile(fName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
	delete [] fName;
#else
	HANDLE binaryFileHandle = CreateFile(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
#endif
	if (binaryFileHandle ==INVALID_HANDLE_VALUE)
	{
		// No usable cache file; log why (diagnostics only) and fall through
		// with binaryFileValid == false.
		DWORD errorCode;
		errorCode = GetLastError();
		switch (errorCode)
		{
		case ERROR_FILE_NOT_FOUND:
			{
				debugPrintf("\nCached file not found %s\n", binaryFileName);
				break;
			}
		case ERROR_PATH_NOT_FOUND:
			{
				debugPrintf("\nCached file path not found %s\n", binaryFileName);
				break;
			}
		default:
			{
				debugPrintf("\nFailed reading cached file with errorCode = %d\n", errorCode);
			}
		}
	} else
	{
		// Cache file exists; grab its last-write time.
		if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0)
		{
			DWORD errorCode;
			errorCode = GetLastError();
			debugPrintf("\nGetFileTime errorCode = %d\n", errorCode);
		} else
		{
			binaryFileValid = true;
		}
		CloseHandle(binaryFileHandle);
	}

	if (binaryFileValid)
	{
		// Compare against the source file's last-write time.
#ifdef UNICODE
		int nameLength = (int)strlen(srcFileName)+1;
		WCHAR* fName = new WCHAR[nameLength];
		MultiByteToWideChar(CP_ACP,0,srcFileName,-1, fName, nameLength);
		HANDLE srcFileHandle = CreateFile(fName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
		delete [] fName;
#else
		HANDLE srcFileHandle = CreateFile(srcFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
#endif
		if (srcFileHandle!=INVALID_HANDLE_VALUE)
		{
			FILETIME modtimeSrc;
			if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0)
			{
				DWORD errorCode;
				errorCode = GetLastError();
				debugPrintf("\nGetFileTime errorCode = %d\n", errorCode);
			}
			// Binary is up to date when srcTime <= binaryTime; FILETIME is
			// compared as a (high, low) 64-bit pair.
			if (  ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime)
				||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
			{
				fileUpToDate=true;
			} else
			{
				debugPrintf("\nCached binary file found (%s), but out-of-date\n",binaryFileName);
			}
			CloseHandle(srcFileHandle);
		}
		else
		{
#ifdef _DEBUG
			// Debug builds treat a missing source file as a hard error.
			DWORD errorCode;
			errorCode = GetLastError();
			switch (errorCode)
			{
			case ERROR_FILE_NOT_FOUND:
				{
					debugPrintf("\nSrc file not found %s\n", srcFileName);
					break;
				}
			case ERROR_PATH_NOT_FOUND:
				{
					debugPrintf("\nSrc path not found %s\n", srcFileName);
					break;
				}
			default:
				{
					debugPrintf("\nnSrc file reading errorCode = %d\n", errorCode);
				}
			}
			ADLASSERT(0);
#else
			//if we cannot find the source, assume it is OK in release builds
			fileUpToDate = true;
#endif
		}
	}
	return fileUpToDate;
}
/// Builds a CL program for 'fileName' (optionally appending ".cl"), using a
/// per-device/per-driver binary cache under cache/ to skip recompilation.
/// Falls back to compiling from source when no valid cached binary exists,
/// then refreshes the cache.
/// Fixes vs. original: the cached binary blob was leaked after
/// clCreateProgramWithBinary; build_log was freed with scalar delete;
/// File::open returned NULL from a bool function.
template<>
void KernelBuilder<TYPE_CL>::setFromFile( const Device* deviceData, const char* fileName, const char* option, bool addExtension,
	bool cacheKernel)
{
	m_deviceData = deviceData;

	char fileNameWithExtension[256];
	if( addExtension )
		sprintf_s( fileNameWithExtension, "%s.cl", fileName );
	else
		sprintf_s( fileNameWithExtension, "%s", fileName );

	// Small helper that slurps an entire text file into a std::string.
	class File
	{
	public:
		__inline
		bool open(const char* fileNameWithExtension)
		{
			size_t size;
			char* str;

			std::fstream f(fileNameWithExtension, (std::fstream::in | std::fstream::binary));

			if (f.is_open()) {
				size_t sizeFile;
				f.seekg(0, std::fstream::end);
				size = sizeFile = (size_t)f.tellg();
				f.seekg(0, std::fstream::beg);

				str = new char[size + 1];
				if (!str) {
					f.close();
					return false; // was 'return NULL' in a bool function
				}

				f.read(str, sizeFile);
				f.close();
				str[size] = '\0';
				m_source = str;
				delete[] str;
				return true;
			}
			return false;
		}
		const std::string& getSource() const {return m_source;}

	private:
		std::string m_source;
	};

	cl_program& program = (cl_program&)m_ptr;
	cl_int status = 0;

	bool cacheBinary = cacheKernel;
#if defined(ADL_CL_FORCE_UNCACHE_KERNEL)
	cacheBinary = false;
#endif

	// Cache key: file name + device name + driver version, so a driver
	// update or a different GPU invalidates the cached binary.
	char binaryFileName[512];
	{
		char deviceName[256];
		deviceData->getDeviceName(deviceName);
		char driverVersion[256];
		const DeviceCL* dd = (const DeviceCL*) deviceData;
		clGetDeviceInfo(dd->m_deviceIdx, CL_DRIVER_VERSION, 256, &driverVersion, NULL);
		const char* strippedFileName = strip(fileName,"\\");
		strippedFileName = strip(strippedFileName,"/");
		sprintf_s(binaryFileName,"cache/%s.%s.%s.bin",strippedFileName, deviceName,driverVersion );
	}

	bool upToDate = isFileUpToDate(binaryFileName,fileNameWithExtension);

	if( cacheBinary && upToDate)
	{	// Load and build the cached binary.
		FILE* file = fopen(binaryFileName, "rb");
		if( file )
		{
			fseek( file, 0L, SEEK_END );
			size_t binarySize = ftell( file );
			rewind( file );
			char* binary = new char[binarySize];
			fread( binary, sizeof(char), binarySize, file );
			fclose( file );

			if (binarySize)
			{
				const DeviceCL* dd = (const DeviceCL*) deviceData;
				program = clCreateProgramWithBinary( dd->m_context, 1, &dd->m_deviceIdx, &binarySize, (const unsigned char**)&binary, 0, &status );
				ADLASSERT( status == CL_SUCCESS );
				status = clBuildProgram( program, 1, &dd->m_deviceIdx, option, 0, 0 );
				ADLASSERT( status == CL_SUCCESS );
				if( status != CL_SUCCESS )
				{
					char *build_log;
					size_t ret_val_size;
					clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
					build_log = new char[ret_val_size+1];
					clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
					build_log[ret_val_size] = '\0';
					debugPrintf("%s\n", build_log);
					delete[] build_log; // array form; was scalar delete
					ADLASSERT(0);
				}
			}
			delete[] binary; // was leaked; CL copies the binary during program creation
		}
	}

	if( !m_ptr )
	{	// No usable cache: compile from source ...
		File kernelFile;
		ADLASSERT( kernelFile.open( fileNameWithExtension ) );
		const char* source = kernelFile.getSource().c_str();
		setFromSrc( m_deviceData, source, option );

		if( cacheBinary )
		{	// ... and write the freshly built binary back to the cache.
			size_t binarySize;
			status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
			ADLASSERT( status == CL_SUCCESS );

			char* binary = new char[binarySize];
			status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
			ADLASSERT( status == CL_SUCCESS );

			{
				FILE* file = fopen(binaryFileName, "wb");
				if (file)
				{
					fwrite( binary, sizeof(char), binarySize, file );
					fclose( file );
				}
			}
			delete [] binary;
		}
	}
}
/// Builds a CL program from in-memory source 'src', reusing the same on-disk
/// binary cache as setFromFile; 'fileName' contributes only the cache key
/// and the timestamp comparison against fileName + ".cl".
/// Fixes vs. original: the timestamp check ran twice (first result unused);
/// sprintf_s carried a stray third argument; build_log was freed with
/// scalar delete.
template<>
void KernelBuilder<TYPE_CL>::setFromSrcCached( const Device* deviceData, const char* src, const char* fileName, const char* option )
{
	m_deviceData = deviceData;

	bool cacheBinary = true;

	cl_program& program = (cl_program&)m_ptr;
	cl_int status = 0;

	// Cache key: file name + device name + driver version.
	char binaryFileName[512];
	{
		char deviceName[256];
		deviceData->getDeviceName(deviceName);
		char driverVersion[256];
		const DeviceCL* dd = (const DeviceCL*) deviceData;
		clGetDeviceInfo(dd->m_deviceIdx, CL_DRIVER_VERSION, 256, &driverVersion, NULL);
		const char* strippedFileName = strip(fileName,"\\");
		strippedFileName = strip(strippedFileName,"/");
		sprintf_s(binaryFileName,"cache/%s.%s.%s.bin",strippedFileName, deviceName,driverVersion );
	}

	char fileNameWithExtension[256];
	sprintf_s(fileNameWithExtension,"%s.cl",fileName); // dropped stray extra argument

	if( cacheBinary )
	{
		// Single timestamp check (the original computed this twice).
		bool fileUpToDate = isFileUpToDate(binaryFileName,fileNameWithExtension);

		if( fileUpToDate)
		{
			FILE* file = fopen(binaryFileName, "rb");
			if (file)
			{
				fseek( file, 0L, SEEK_END );
				size_t binarySize = ftell( file );
				rewind( file );
				char* binary = new char[binarySize];
				fread( binary, sizeof(char), binarySize, file );
				fclose( file );

				const DeviceCL* dd = (const DeviceCL*) deviceData;
				program = clCreateProgramWithBinary( dd->m_context, 1, &dd->m_deviceIdx, &binarySize, (const unsigned char**)&binary, 0, &status );
				ADLASSERT( status == CL_SUCCESS );
				status = clBuildProgram( program, 1, &dd->m_deviceIdx, option, 0, 0 );
				ADLASSERT( status == CL_SUCCESS );
				if( status != CL_SUCCESS )
				{
					char *build_log;
					size_t ret_val_size;
					clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
					build_log = new char[ret_val_size+1];
					clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
					build_log[ret_val_size] = '\0';
					debugPrintf("%s\n", build_log);
					delete[] build_log; // array form; was scalar delete
					ADLASSERT(0);
				}
				delete[] binary;
			}
		}
	}

	if( !m_ptr )
	{
		setFromSrc( deviceData, src, option );

		if( cacheBinary )
		{	// Write the binary back to the cache (single-device programs only).
			cl_uint numAssociatedDevices;
			status = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 );
			ADLASSERT( status == CL_SUCCESS );
			if (numAssociatedDevices==1)
			{
				size_t binarySize;
				status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
				ADLASSERT( status == CL_SUCCESS );

				char* binary = new char[binarySize];
				status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
				ADLASSERT( status == CL_SUCCESS );

				{
					FILE* file = fopen(binaryFileName, "wb");
					if (file)
					{
						fwrite( binary, sizeof(char), binarySize, file );
						fclose( file );
					}
				}
				delete [] binary;
			}
		}
	}
}
/// Compiles a CL program directly from source text; on a build failure the
/// build log is printed before asserting so the compile error is visible.
/// Fix vs. original: build_log was freed with scalar delete and only after
/// ADLASSERT(0) (leaked in debug builds where the assert aborts).
template<>
void KernelBuilder<TYPE_CL>::setFromSrc( const Device* deviceData, const char* src, const char* option )
{
	ADLASSERT( deviceData->m_type == TYPE_CL );
	m_deviceData = deviceData;
	const DeviceCL* dd = (const DeviceCL*) deviceData;

	cl_program& program = (cl_program&)m_ptr;
	cl_int status = 0;
	size_t srcSize[] = {strlen( src )};
	program = clCreateProgramWithSource( dd->m_context, 1, &src, srcSize, &status );
	ADLASSERT( status == CL_SUCCESS );
	status = clBuildProgram( program, 1, &dd->m_deviceIdx, option, NULL, NULL );
	if( status != CL_SUCCESS )
	{
		// Fetch and print the build log, release it, then assert.
		char *build_log;
		size_t ret_val_size;
		clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
		build_log = new char[ret_val_size+1];
		clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
		build_log[ret_val_size] = '\0';
		debugPrintf("%s\n", build_log);
		printf("%s\n", build_log);
		delete[] build_log; // array form, freed before the assert
		ADLASSERT(0);
	}
}
/// The builder owns the cl_program it created; release it on destruction.
template<>
KernelBuilder<TYPE_CL>::~KernelBuilder()
{
	clReleaseProgram( (cl_program)m_ptr );
}
/// Extracts the named entry point from the compiled program into kernelOut.
template<>
void KernelBuilder<TYPE_CL>::createKernel( const char* funcName, Kernel& kernelOut )
{
	KernelCL& clKernel = (KernelCL&)kernelOut;
	cl_int err = 0;
	clKernel.getKernel() = clCreateKernel( (cl_program)m_ptr, funcName, &err );
	ADLASSERT( err == CL_SUCCESS );
	kernelOut.m_type = TYPE_CL;
}
/// Drops the CL reference held by this kernel object.
template<>
void KernelBuilder<TYPE_CL>::deleteKernel( Kernel& kernel )
{
	clReleaseKernel( ((KernelCL*)&kernel)->getKernel() );
}
/// Static helpers implementing Launcher's argument-binding and dispatch
/// operations for the OpenCL backend.
class LauncherCL
{
	public:
		typedef Launcher::BufferInfo BufferInfo;

		/// Binds 'n' buffers as consecutive kernel arguments.
		__inline
		static void setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n );
		/// Passes 'consts' by value as the next kernel argument.
		template<typename T>
		__inline
		static void setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts );
		/// Enqueues a 2D NDRange covering at least numThreadsX x numThreadsY work items.
		__inline
		static void launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY );
};
/// Binds each buffer's cl_mem to the next consecutive kernel argument slot.
void LauncherCL::setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n )
{
	KernelCL* clKernel = (KernelCL*)launcher->m_kernel;
	for(int slot=0; slot<n; slot++)
	{
		Buffer<int>* clBuffer = (Buffer<int>*)buffInfo[slot].m_buffer;
		cl_int err = clSetKernelArg( clKernel->getKernel(), launcher->m_idx++, sizeof(cl_mem), &clBuffer->m_ptr );
		ADLASSERT( err == CL_SUCCESS );
	}
}
/// Passes 'consts' by value as the next kernel argument; on the CL backend
/// the 'constBuff' parameter is unused.
template<typename T>
void LauncherCL::setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts )
{
	KernelCL* clKernel = (KernelCL*)launcher->m_kernel;
	cl_int err = clSetKernelArg( clKernel->getKernel(), launcher->m_idx++, sizeof(T), &consts );
	ADLASSERT( err == CL_SUCCESS );
}
/// Enqueues a 2D NDRange: the global range is the thread counts rounded up
/// to whole work groups, with at least one group per dimension.
void LauncherCL::launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
	KernelCL* clKernel = (KernelCL*)launcher->m_kernel;
	const DeviceCL* device = (const DeviceCL*)launcher->m_deviceData;

	size_t lRange[3] = {1,1,1};
	size_t gRange[3] = {1,1,1};
	lRange[0] = localSizeX;
	lRange[1] = localSizeY;
	// Ceil-divide threads by local size, clamp to >= 1 group, scale back up.
	gRange[0] = lRange[0]*max( (size_t)1, (numThreadsX + lRange[0] - 1)/lRange[0] );
	gRange[1] = lRange[1]*max( (size_t)1, (numThreadsY + lRange[1] - 1)/lRange[1] );

	cl_int status = clEnqueueNDRangeKernel( device->m_commandQueue,
		clKernel->getKernel(), 2, NULL, gRange, lRange, 0,0,0 );
	ADLASSERT( status == CL_SUCCESS );
}
};

View File

@@ -0,0 +1,512 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#include <windows.h>
#include <d3d11.h>
#include <d3dx11.h>
#include <d3dcompiler.h>
#include <DXGI.h>
#pragma comment(lib,"d3dx11.lib")
#pragma comment(lib,"d3d11.lib")
#pragma comment(lib,"DXGI.lib")
namespace adl
{
#define u32 unsigned int
/// Direct3D 11 implementation of the adl Device abstraction. Owns the D3D11
/// device + immediate context and a KernelManager caching compiled shaders.
struct DeviceDX11 : public Device
{
	typedef DeviceUtils::Config Config;

	__inline
	DeviceDX11() : Device( TYPE_DX11 ), m_kernelManager(0){}
	/// Returns the ID3D11DeviceContext as an opaque pointer.
	__inline
	void* getContext() const { return m_context; }
	/// Creates the device/context on the adapter selected by cfg.
	__inline
	void initialize(const Config& cfg);
	/// Releases context, device and kernel cache.
	__inline
	void release();

	/// Creates a D3D11 buffer of nElems elements (plus UAV/SRV views where applicable).
	template<typename T>
	__inline
	void allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type);
	/// Releases the buffer and its views.
	template<typename T>
	__inline
	void deallocate(Buffer<T>* buf);
	/// Buffer-to-buffer element copy (no offsets on the DX11 backend).
	template<typename T>
	__inline
	void copy(Buffer<T>* dst, const Buffer<T>* src, int nElems);
	/// Device -> host read via a temporary staging buffer.
	template<typename T>
	__inline
	void copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems = 0);
	/// Host -> device write.
	template<typename T>
	__inline
	void copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems = 0);

	__inline
	void waitForCompletion() const;

	__inline
	void getDeviceName( char nameOut[128] ) const;

	__inline
	static
	int getNDevices();

	/// Fetches (compiling and caching on demand) a compute kernel.
	__inline
	Kernel* getKernel(const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL, bool cacheKernel = true )const;

	ID3D11DeviceContext* m_context;   // immediate context
	ID3D11Device* m_device;
	IDXGISwapChain* m_swapChain;      // NOTE(review): never initialized in this file — presumably set externally; confirm before use
	KernelManager* m_kernelManager;   // kernel compilation cache
};
/// DX11 view of the generic Buffer: reinterprets the stored opaque pointers
/// as the D3D11 buffer plus its unordered-access and shader-resource views.
template<typename T>
struct BufferDX11 : public Buffer<T>
{
	ID3D11Buffer* getBuffer() { return (ID3D11Buffer*)m_ptr; }
	ID3D11UnorderedAccessView* getUAV() { return (ID3D11UnorderedAccessView*)m_uav; }
	ID3D11ShaderResourceView* getSRV() { return (ID3D11ShaderResourceView*)m_srv; }

	// Pointer-to-slot accessors used when the D3D API fills the handle in.
	ID3D11Buffer** getBufferPtr() { return (ID3D11Buffer**)&m_ptr; }
	ID3D11UnorderedAccessView** getUAVPtr() { return (ID3D11UnorderedAccessView**)&m_uav; }
	ID3D11ShaderResourceView** getSRVPtr() { return (ID3D11ShaderResourceView**)&m_srv; }
};
#define SAFE_RELEASE(p) { if(p) { (p)->Release(); (p)=NULL; } }
/// Creates a D3D11 device + immediate context on the adapter selected by
/// cfg.m_deviceIdx, verifies compute-shader (CS4.x structured/raw buffer)
/// support, then creates the kernel cache. Asserts when no suitable DX11
/// hardware is present.
/// Fix vs. original: the IDXGIAdapter returned by EnumAdapters was never
/// Released (COM reference leak).
void DeviceDX11::initialize(const Config& cfg)
{
	DeviceDX11* deviceData = this;

	HRESULT hr = S_OK;
	UINT createDeviceFlg = 0;
#ifdef _DEBUG
	createDeviceFlg |= D3D11_CREATE_DEVICE_DEBUG;
#endif

	D3D_FEATURE_LEVEL fl[] = {
		D3D_FEATURE_LEVEL_11_0,
		D3D_FEATURE_LEVEL_10_1,
		D3D_FEATURE_LEVEL_10_0
	};

	// NOTE(review): the dynamically resolved entry point below is never used —
	// the code calls D3D11CreateDevice directly. Kept as-is (and the module
	// deliberately stays loaded for the process lifetime).
	typedef HRESULT (WINAPI * LPD3D11CREATEDEVICE)( IDXGIAdapter*, D3D_DRIVER_TYPE, HMODULE, u32, D3D_FEATURE_LEVEL*, UINT, u32, ID3D11Device**, D3D_FEATURE_LEVEL*, ID3D11DeviceContext** );
	HMODULE moduleD3D11 = 0;
#ifdef UNICODE
	moduleD3D11 = LoadLibrary( L"d3d11.dll" );
#else
	moduleD3D11 = LoadLibrary( "d3d11.dll" );
#endif
	ADLASSERT( moduleD3D11 );
	LPD3D11CREATEDEVICE _DynamicD3D11CreateDevice;
	_DynamicD3D11CreateDevice = ( LPD3D11CREATEDEVICE )GetProcAddress( moduleD3D11, "D3D11CreateDevice" );

	D3D_DRIVER_TYPE type = D3D_DRIVER_TYPE_HARDWARE;
	// http://msdn.microsoft.com/en-us/library/ff476082(v=VS.85).aspx
	// A non-NULL pAdapter requires D3D_DRIVER_TYPE_UNKNOWN; passing
	// D3D_DRIVER_TYPE_HARDWARE with an explicit adapter yields E_INVALIDARG.
	type = D3D_DRIVER_TYPE_UNKNOWN;

	IDXGIAdapter* adapter = NULL;
	{// pick the adapter matching cfg.m_deviceIdx
		IDXGIFactory* factory = NULL;
		int targetAdapterIdx = cfg.m_deviceIdx;//min( cfg.m_deviceIdx, getNDevices()-1 );
		CreateDXGIFactory( __uuidof(IDXGIFactory), (void**)&factory );
		u32 i = 0;
		while( factory->EnumAdapters( i, &adapter ) != DXGI_ERROR_NOT_FOUND )
		{
			if( i== targetAdapterIdx ) break;
			i++;
		}
		factory->Release();
	}

	// Create a hardware Direct3D 11 device
	hr = D3D11CreateDevice( adapter,
		type,
		NULL, createDeviceFlg,
		fl, _countof(fl), D3D11_SDK_VERSION, &deviceData->m_device, NULL, &deviceData->m_context );
	ADLASSERT( hr == S_OK );

	// The device holds its own reference to the adapter; release the one
	// EnumAdapters handed us (was leaked before).
	SAFE_RELEASE( adapter );

	// Check if the hardware device supports Compute Shader 4.0
	D3D11_FEATURE_DATA_D3D10_X_HARDWARE_OPTIONS hwopts;
	deviceData->m_device->CheckFeatureSupport(D3D11_FEATURE_D3D10_X_HARDWARE_OPTIONS, &hwopts, sizeof(hwopts));
	if( !hwopts.ComputeShaders_Plus_RawAndStructuredBuffers_Via_Shader_4_x )
	{
		SAFE_RELEASE( deviceData->m_context );
		SAFE_RELEASE( deviceData->m_device );
		debugPrintf("DX11 GPU is not present\n");
		ADLASSERT( 0 );
	}

	m_kernelManager = new KernelManager;
}
/// Releases the immediate context, the device and the kernel cache.
/// m_kernelManager is reset to 0 so a repeated release does not delete
/// the cache twice (matches the OpenCL backend).
void DeviceDX11::release()
{
	SAFE_RELEASE( m_context );
	SAFE_RELEASE( m_device );
	if( m_kernelManager )
	{
		delete m_kernelManager;
		m_kernelManager = 0; // was left dangling after delete
	}
}
/// Allocates a D3D11 buffer of nElems elements of T on this device and,
/// depending on 'type', the UAV/SRV views compute shaders need.
/// BUFFER_ZERO_COPY is not supported on the DX11 backend.
template<typename T>
void DeviceDX11::allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type)
{
	ADLASSERT( type != BufferBase::BUFFER_ZERO_COPY );

	DeviceDX11* deviceData = this;
	buf->m_device = deviceData;
	buf->m_size = nElems;

	BufferDX11<T>* dBuf = (BufferDX11<T>*)buf;

//	if( type & BufferBase::BUFFER )
	{
		HRESULT hr = S_OK;

		if( type == BufferBase::BUFFER_CONST )
		{
			// Constant buffers hold exactly one T, padded up to the 16-byte
			// multiple D3D11 requires for cbuffers.
			ADLASSERT( nElems == 1 );
			D3D11_BUFFER_DESC constant_buffer_desc;
			ZeroMemory( &constant_buffer_desc, sizeof(constant_buffer_desc) );
//			constant_buffer_desc.ByteWidth = NEXTMULTIPLEOF( sizeof(T), 16 );
			constant_buffer_desc.ByteWidth = (((sizeof(T))/(16) + (((sizeof(T))%(16)==0)?0:1))*(16));
//			constant_buffer_desc.Usage = D3D11_USAGE_DYNAMIC;
//			constant_buffer_desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
//			constant_buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
			constant_buffer_desc.Usage = D3D11_USAGE_DEFAULT;
			constant_buffer_desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
			constant_buffer_desc.CPUAccessFlags = 0;
			hr = deviceData->m_device->CreateBuffer( &constant_buffer_desc, NULL, dBuf->getBufferPtr() );
			ADLASSERT( hr == S_OK );
			return;
		}

		D3D11_BUFFER_DESC buffer_desc;
		ZeroMemory(&buffer_desc, sizeof(buffer_desc));
		buffer_desc.ByteWidth = nElems * sizeof(T);

		// Structured buffers carry a per-element stride; raw buffers do not.
		if( type != BufferBase::BUFFER_RAW )
		{
			buffer_desc.StructureByteStride = sizeof(T);
//			buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
		}

		if( type == BufferBase::BUFFER_STAGING )
		{
			// CPU-readable staging buffer, used for GPU->host copies.
			buffer_desc.Usage = D3D11_USAGE_STAGING;
			buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
		}
		else if( type == BufferBase::BUFFER_INDEX )
		{
			buffer_desc.Usage = D3D11_USAGE_DEFAULT;
			buffer_desc.BindFlags = D3D11_BIND_INDEX_BUFFER;
		}
		else if( type == BufferBase::BUFFER_VERTEX )
		{
			buffer_desc.Usage = D3D11_USAGE_DEFAULT;
			buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
		}
		else
		{
			// Default compute path: bindable as UAV and SRV, structured
			// unless a raw view was requested.
			buffer_desc.Usage = D3D11_USAGE_DEFAULT;
			buffer_desc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;
			buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;

			//	check this
			if(type == BufferBase::BUFFER_RAW)
			{
//				buffer_desc.BindFlags |= D3D11_BIND_INDEX_BUFFER | D3D11_BIND_VERTEX_BUFFER;
				buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS | D3D11_RESOURCE_MISC_DRAWINDIRECT_ARGS; // need this to be used for DispatchIndirect
			}
		}

		hr = deviceData->m_device->CreateBuffer(&buffer_desc, NULL, dBuf->getBufferPtr());
		ADLASSERT( hr == S_OK );

		// Index buffers need no shader views.
		if( type == BufferBase::BUFFER_INDEX ) return;

		if( type == BufferBase::BUFFER ||
			type == BufferBase::BUFFER_RAW ||
			type == BufferBase::BUFFER_W_COUNTER )
		{
			//	Create UAVs for all CS buffers
			D3D11_UNORDERED_ACCESS_VIEW_DESC uavbuffer_desc;
			ZeroMemory(&uavbuffer_desc, sizeof(uavbuffer_desc));
			uavbuffer_desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
			if( type == BufferBase::BUFFER_RAW )
			{
				// Raw UAVs view the memory as 32-bit typeless words.
				uavbuffer_desc.Format = DXGI_FORMAT_R32_TYPELESS;
				uavbuffer_desc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW;
				uavbuffer_desc.Buffer.NumElements = buffer_desc.ByteWidth / 4;
			}
			else
			{
				uavbuffer_desc.Format = DXGI_FORMAT_UNKNOWN;
				uavbuffer_desc.Buffer.NumElements = nElems;
			}

			if( type == BufferBase::BUFFER_W_COUNTER )
			{
				// Hidden counter usable via IncrementCounter/DecrementCounter in HLSL.
				uavbuffer_desc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_COUNTER;
			}

			hr = deviceData->m_device->CreateUnorderedAccessView(dBuf->getBuffer(), &uavbuffer_desc, dBuf->getUAVPtr());
			ADLASSERT( hr == S_OK );

			//	Create SRVs for all CS buffers
			D3D11_SHADER_RESOURCE_VIEW_DESC srvbuffer_desc;
			ZeroMemory(&srvbuffer_desc, sizeof(srvbuffer_desc));
			if( type == BufferBase::BUFFER_RAW )
			{
				ADLASSERT( sizeof(T) <= 16 );
				srvbuffer_desc.Format = DXGI_FORMAT_R32_UINT;
				srvbuffer_desc.Buffer.ElementWidth = nElems;
//				if ( buffer_desc.MiscFlags & D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS )
//				{
//					srvbuffer_desc.Format = DXGI_FORMAT_R32_TYPELESS;
//					srvbuffer_desc.BufferEx.Flags = D3D11_BUFFEREX_SRV_FLAG_RAW;
//					srvbuffer_desc.BufferEx.NumElements = buffer_desc.ByteWidth / 4;
			}
			else
			{
				srvbuffer_desc.Format = DXGI_FORMAT_UNKNOWN;
				srvbuffer_desc.Buffer.ElementWidth = nElems;
			}
			srvbuffer_desc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
			hr = deviceData->m_device->CreateShaderResourceView(dBuf->getBuffer(), &srvbuffer_desc, dBuf->getSRVPtr());
			ADLASSERT( hr == S_OK );
		}
		else if( type == BufferBase::BUFFER_APPEND )
		{
			// Append/consume buffers get a UAV with the APPEND flag only.
			D3D11_UNORDERED_ACCESS_VIEW_DESC desc;
			ZeroMemory( &desc, sizeof(desc) );
			desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
			desc.Buffer.FirstElement = 0;
			desc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_APPEND;
			desc.Format = DXGI_FORMAT_UNKNOWN; // Format must be must be DXGI_FORMAT_UNKNOWN, when creating a View of a Structured Buffer
			desc.Buffer.NumElements = buffer_desc.ByteWidth / buffer_desc.StructureByteStride;
			hr = deviceData->m_device->CreateUnorderedAccessView( dBuf->getBuffer(), &desc, dBuf->getUAVPtr() );
			ADLASSERT( hr == S_OK );
		}
	}
//	else
//	{
//		ADLASSERT(0);
//	}
}
/// Releases the D3D buffer and both of its views (if present), clearing each
/// handle, then detaches the buffer from this device.
template<typename T>
void DeviceDX11::deallocate(Buffer<T>* buf)
{
	BufferDX11<T>* dBuf = (BufferDX11<T>*)buf;

	ID3D11Buffer* buffer = dBuf->getBuffer();
	if( buffer )
	{
		buffer->Release();
		dBuf->m_ptr = NULL;
	}
	ID3D11UnorderedAccessView* uav = dBuf->getUAV();
	if( uav )
	{
		uav->Release();
		dBuf->m_uav = NULL;
	}
	ID3D11ShaderResourceView* srv = dBuf->getSRV();
	if( srv )
	{
		srv->Release();
		dBuf->m_srv = NULL;
	}
	buf->m_device = 0;
}
/// Copies nElems elements between buffers. DX11-to-DX11 copies use
/// CopySubresourceRegion; host<->device pairs route through the buffer's
/// write/read helpers.
/// Fix vs. original: the first test used '||', so any pair with one DX11
/// side fell into the GPU-GPU path and reinterpreted a host buffer as a
/// BufferDX11. It now requires both sides to be DX11 (matching the OpenCL
/// backend); the unused mapped-resource local is gone.
template<typename T>
void DeviceDX11::copy(Buffer<T>* dst, const Buffer<T>* src, int nElems)
{
	if( dst->m_device->m_type == TYPE_DX11 && src->m_device->m_type == TYPE_DX11 )
	{
		DeviceDX11* deviceData = this;
		BufferDX11<T>* dDst = (BufferDX11<T>*)dst;
		BufferDX11<T>* dSrc = (BufferDX11<T>*)src;

		// Source box selecting the first nElems*sizeof(T) bytes.
		D3D11_BOX srcRegion;
		srcRegion.left = 0*sizeof(T);
		srcRegion.front = 0;
		srcRegion.top = 0;
		srcRegion.bottom = 1;
		srcRegion.back = 1;
		srcRegion.right = (0+nElems)*sizeof(T);

		deviceData->m_context->CopySubresourceRegion(
			dDst->getBuffer(),
			0, 0, 0, 0,
			dSrc->getBuffer(),
			0,
			&srcRegion );
	}
	else if( src->m_device->m_type == TYPE_HOST )
	{
		// Host -> device upload.
		ADLASSERT( dst->getType() == TYPE_DX11 );
		dst->write( src->m_ptr, nElems );
	}
	else if( dst->m_device->m_type == TYPE_HOST )
	{
		// Device -> host download.
		ADLASSERT( src->getType() == TYPE_DX11 );
		src->read( dst->m_ptr, nElems );
	}
	else
	{
		ADLASSERT( 0 );
	}
}
template<typename T>
void DeviceDX11::copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems)
{
DeviceDX11* deviceData = this;
BufferDX11<T>* dSrc = (BufferDX11<T>*)src;
Buffer<T> sBuf( deviceData, nElems, BufferBase::BUFFER_STAGING );
BufferDX11<T>* dStagingBuf = (BufferDX11<T>*)&sBuf;
ID3D11Buffer *StagingBuffer = dStagingBuf->getBuffer();
D3D11_MAPPED_SUBRESOURCE MappedVelResource = {0};
D3D11_BOX destRegion;
destRegion.left = srcOffsetNElems*sizeof(T);
destRegion.front = 0;
destRegion.top = 0;
destRegion.bottom = 1;
destRegion.back = 1;
destRegion.right = (srcOffsetNElems+nElems)*sizeof(T);
deviceData->m_context->CopySubresourceRegion(
StagingBuffer,
0, 0, 0, 0,
dSrc->getBuffer(),
0,
&destRegion);
deviceData->m_context->Map(StagingBuffer, 0, D3D11_MAP_READ, 0, &MappedVelResource);
memcpy(dst, MappedVelResource.pData, nElems*sizeof(T));
deviceData->m_context->Unmap(StagingBuffer, 0);
}
template<typename T>
void DeviceDX11::copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems)
{
	// Upload nElems elements from host memory into dst, starting at element
	// offset dstOffsetNElems. The box is expressed in bytes.
	BufferDX11<T>* dDst = (BufferDX11<T>*)dst;

	D3D11_BOX region;
	region.left = dstOffsetNElems*sizeof(T);
	region.right = (dstOffsetNElems+nElems)*sizeof(T);
	region.top = 0;
	region.bottom = 1;
	region.front = 0;
	region.back = 1;
	m_context->UpdateSubresource(dDst->getBuffer(), 0, &region, src, 0, 0);
}
void DeviceDX11::waitForCompletion() const
{
const DeviceDX11* deviceData = this;
ID3D11Query* syncQuery;
D3D11_QUERY_DESC qDesc;
qDesc.Query = D3D11_QUERY_EVENT;
qDesc.MiscFlags = 0;
deviceData->m_device->CreateQuery( &qDesc, &syncQuery );
deviceData->m_context->End( syncQuery );
while( deviceData->m_context->GetData( syncQuery, 0,0,0 ) == S_FALSE ){}
syncQuery->Release();
}
int DeviceDX11::getNDevices()
{
IDXGIFactory1* factory = NULL;
IDXGIAdapter1* adapter = NULL;
CreateDXGIFactory1( __uuidof(IDXGIFactory1), (void**)&factory );
u32 i = 0;
while( factory->EnumAdapters1( i, &adapter ) != DXGI_ERROR_NOT_FOUND )
{
i++;
}
factory->Release();
return i;
}
void DeviceDX11::getDeviceName( char nameOut[128] ) const
{
IDXGIAdapter* adapter;// = getAdapterFromDevice( this );
{
IDXGIDevice* pDXGIDevice;
ADLASSERT( m_device->QueryInterface(__uuidof(IDXGIDevice), (void **)&pDXGIDevice) == S_OK );
ADLASSERT( pDXGIDevice->GetParent(__uuidof(IDXGIAdapter), (void **)&adapter) == S_OK );
pDXGIDevice->Release();
}
DXGI_ADAPTER_DESC adapterDesc;
adapter->GetDesc( &adapterDesc );
// wcstombs( nameOut, adapterDesc.Description, 128 );
size_t i;
wcstombs_s( &i, nameOut, 128, adapterDesc.Description, 128 );
}
Kernel* DeviceDX11::getKernel(const char* fileName, const char* funcName, const char* option, const char* src, bool cacheKernel ) const
{
	// Compilation and caching are delegated to the shared kernel manager.
	Kernel* kernel = m_kernelManager->query( this, fileName, funcName, option, src, cacheKernel );
	return kernel;
}
#undef u32
#undef SAFE_RELEASE
};

View File

@@ -0,0 +1,348 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
namespace adl
{
#define SAFE_RELEASE(p) { if(p) { (p)->Release(); (p)=NULL; } }
// DX11 view of the type-erased Kernel: m_kernel stores an ID3D11ComputeShader.
struct KernelDX11 : public Kernel
{
	ID3D11ComputeShader* getKernel() { return reinterpret_cast<ID3D11ComputeShader*>(m_kernel); }
	ID3D11ComputeShader** getKernelPtr() { return reinterpret_cast<ID3D11ComputeShader**>(&m_kernel); }
};
__inline
#ifdef UNICODE
HRESULT FindDXSDKShaderFileCch( __in_ecount(cchDest) WCHAR* strDestPath,
                                int cchDest,
                                __in LPCWSTR strFilename )
#else
HRESULT FindDXSDKShaderFileCch( __in_ecount(cchDest) CHAR* strDestPath,
                                int cchDest,
                                __in LPCSTR strFilename )
#endif
{
	// Resolve a shader file name to a path, searching first the current
	// directory and then %EXE_DIR%\..\%EXE_NAME%\. On failure, strDestPath is
	// set to the bare file name and E_FAIL is returned (after asserting).
	if( NULL == strFilename || strFilename[0] == 0 || NULL == strDestPath || cchDest < 10 )
		return E_INVALIDARG;

	// Get the exe name and exe path.
#ifdef UNICODE
	WCHAR strExePath[MAX_PATH] = { 0 };
	WCHAR strExeName[MAX_PATH] = { 0 };
	WCHAR* strLastSlash = NULL;
#else
	CHAR strExePath[MAX_PATH] = { 0 };
	CHAR strExeName[MAX_PATH] = { 0 };
	CHAR* strLastSlash = NULL;
#endif
	GetModuleFileName( NULL, strExePath, MAX_PATH );
	strExePath[MAX_PATH - 1] = 0;
#ifdef UNICODE
	strLastSlash = wcsrchr( strExePath, TEXT( '\\' ) );
#else
	strLastSlash = strrchr( strExePath, TEXT( '\\' ) );
#endif
	if( strLastSlash )
	{
		// Copy the exe name. The original ANSI (#else) branch was empty here,
		// so strExeName stayed "" and the %EXE_NAME% search path was broken.
#ifdef UNICODE
		wcscpy_s( strExeName, MAX_PATH, &strLastSlash[1] );
#else
		strcpy_s( strExeName, MAX_PATH, &strLastSlash[1] );
#endif
		// Chop the exe name from the exe path
		*strLastSlash = 0;

		// Chop the .exe from the exe name
#ifdef UNICODE
		strLastSlash = wcsrchr( strExeName, TEXT( '.' ) );
#else
		strLastSlash = strrchr( strExeName, TEXT( '.' ) );
#endif
		if( strLastSlash )
			*strLastSlash = 0;
	}

	// 1) current working directory
#ifdef UNICODE
	wcscpy_s( strDestPath, cchDest, strFilename );
#else
	strcpy_s( strDestPath, cchDest, strFilename );
#endif
	if( GetFileAttributes( strDestPath ) != INVALID_FILE_ATTRIBUTES )
		return S_OK;

	// 2) %EXE_DIR%\..\%EXE_NAME%\%FILE%
#ifdef UNICODE
	swprintf_s( strDestPath, cchDest, L"%s\\..\\%s\\%s", strExePath, strExeName, strFilename );
#else
	sprintf_s( strDestPath, cchDest, "%s\\..\\%s\\%s", strExePath, strExeName, strFilename );
#endif
	if( GetFileAttributes( strDestPath ) != INVALID_FILE_ATTRIBUTES )
		return S_OK;

	// On failure, return the file as the path but also return an error code.
#ifdef UNICODE
	wcscpy_s( strDestPath, cchDest, strFilename );
#else
	strcpy_s( strDestPath, cchDest, strFilename );
#endif
	ADLASSERT( 0 );
	return E_FAIL;
}
template<>
void KernelBuilder<TYPE_DX11>::setFromFile( const Device* deviceData, const char* fileName, const char* option, bool addExtension,
		bool cacheKernel)
{
	// Resolve the HLSL file for this kernel (optionally appending ".hlsl")
	// and remember the full path in m_path for createKernel().
	char fileNameWithExtension[256];
	if( addExtension )
		sprintf_s( fileNameWithExtension, "%s.hlsl", fileName );
	else
		sprintf_s( fileNameWithExtension, "%s", fileName );

	m_deviceData = deviceData;

	int nameLength = (int)strlen(fileNameWithExtension)+1;
#ifdef UNICODE
	WCHAR* wfileNameWithExtension = new WCHAR[nameLength];
	// Zero the whole array: the original memset cleared only nameLength
	// BYTES, i.e. half of a WCHAR buffer.
	memset(wfileNameWithExtension, 0, nameLength*sizeof(WCHAR));
	MultiByteToWideChar(CP_ACP,0,fileNameWithExtension,-1, wfileNameWithExtension, nameLength);
#else
	CHAR* wfileNameWithExtension = new CHAR[nameLength];
	memset(wfileNameWithExtension, 0, nameLength*sizeof(CHAR));
	sprintf_s(wfileNameWithExtension, nameLength, "%s", fileNameWithExtension);
#endif

	// Locate the shader on disk (cwd, or next to the executable).
	HRESULT hr = FindDXSDKShaderFileCch( m_path, MAX_PATH, wfileNameWithExtension );
	delete [] wfileNameWithExtension;
	ADLASSERT( hr == S_OK );
}
template<>
void KernelBuilder<TYPE_DX11>::setFromSrc( const Device* deviceData, const char* src, const char* option )
{
	// Build from an in-memory source string instead of a file.
	m_deviceData = deviceData;
	m_ptr = (void*)src;
	// Sentinel: a path beginning with the character '0' tells createKernel()
	// to compile from m_ptr rather than from a file (it tests m_path[0] == '0').
	m_path[0] = '0';
}
template<>
KernelBuilder<TYPE_DX11>::~KernelBuilder()
{
	// Nothing to release: the shader blobs are freed at the end of
	// createKernel(), and compiled shaders are released via deleteKernel().
}
template<>
void KernelBuilder<TYPE_DX11>::createKernel( const char* funcName, Kernel& kernelOut )
{
	// Compile the entry point 'funcName' into a D3D11 compute shader, either
	// from the file resolved by setFromFile() or from the in-memory source
	// installed by setFromSrc() (signalled by the "0" path sentinel).
	const DeviceDX11* deviceData = (const DeviceDX11*)m_deviceData;
	KernelDX11* dxKernel = (KernelDX11*)&kernelOut;

	HRESULT hr;
	DWORD dwShaderFlags = D3DCOMPILE_ENABLE_STRICTNESS;
#if defined( DEBUG ) || defined( _DEBUG )
	// Embed debug information; shaders are still optimized and run exactly as
	// in the release configuration.
	dwShaderFlags |= D3DCOMPILE_DEBUG;
#endif

	const D3D_SHADER_MACRO defines[] =
	{
#ifdef USE_STRUCTURED_BUFFERS
		"USE_STRUCTURED_BUFFERS", "1",
#endif
#ifdef TEST_DOUBLE
		"TEST_DOUBLE", "1",
#endif
		NULL, NULL
	};

	// Prefer CS 5.0 where available: better performance on 11-class hardware.
	LPCSTR pProfile = ( deviceData->m_device->GetFeatureLevel() >= D3D_FEATURE_LEVEL_11_0 ) ? "cs_5_0" : "cs_4_0";

	ID3DBlob* pErrorBlob = NULL;
	ID3DBlob* pBlob = NULL;
	if( m_path[0] == '0' )
	{
		char* src = (char*)m_ptr;
		hr = D3DX11CompileFromMemory( src, strlen(src), 0, defines, NULL, funcName, pProfile,
			dwShaderFlags, NULL, NULL, &pBlob, &pErrorBlob, NULL );
	}
	else
	{
		hr = D3DX11CompileFromFile( m_path, defines, NULL, funcName, pProfile,
			dwShaderFlags, NULL, NULL, &pBlob, &pErrorBlob, NULL );
	}
	if ( FAILED(hr) )
	{
		// pErrorBlob is NULL for non-compile failures (e.g. file not found);
		// the original dereferenced it unconditionally and crashed.
		if( pErrorBlob )
			debugPrintf("%s", (char*)pErrorBlob->GetBufferPointer());
	}
	ADLASSERT( hr == S_OK );

	hr = deviceData->m_device->CreateComputeShader( pBlob->GetBufferPointer(), pBlob->GetBufferSize(), NULL,
		dxKernel->getKernelPtr() );
	ADLASSERT( hr == S_OK );	// was unchecked

#if defined(DEBUG) || defined(PROFILE)
	// Name the shader for graphics debuggers. (The original referenced an
	// undeclared 'pFunctionName' here, breaking DEBUG/PROFILE builds.)
	if ( kernelOut.m_kernel )
		kernelOut.m_kernel->SetPrivateData( WKPDID_D3DDebugObjectName, lstrlenA(funcName), funcName );
#endif

	SAFE_RELEASE( pErrorBlob );
	SAFE_RELEASE( pBlob );

	kernelOut.m_type = TYPE_DX11;
}
template<>
void KernelBuilder<TYPE_DX11>::deleteKernel( Kernel& kernel )
{
	// Release the compute shader and clear the handle so a second call is a no-op.
	KernelDX11* dxKernel = (KernelDX11*)&kernel;
	ID3D11ComputeShader* shader = dxKernel->getKernel();
	if( shader != NULL )
	{
		shader->Release();
		kernel.m_kernel = NULL;
	}
}
// Static dispatch helpers used by Launcher when the device is TYPE_DX11:
// buffer binding, constant-buffer upload, and compute dispatch.
class LauncherDX11
{
	public:
		typedef Launcher::BufferInfo BufferInfo;

		// Binds read-only buffers as SRVs (t#) and writable buffers as UAVs (u#).
		__inline
		static void setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n );
		// Uploads 'consts' into constBuff and binds it to constant slot b0.
		template<typename T>
		__inline
		static void setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts );
		// Dispatches ceil(numThreads/localSize) groups per axis, then unbinds.
		__inline
		static void launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY );
};
void LauncherDX11::setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n )
{
	// Bind n buffers to the compute stage: read-only buffers take the next
	// free SRV slot (launcher->m_idx), writable ones the next UAV slot
	// (launcher->m_idxRw). (Removed an unused KernelDX11 local.)
	const DeviceDX11* dddx = (const DeviceDX11*)launcher->m_deviceData;
	for(int i=0; i<n; i++)
	{
		BufferDX11<int>* dBuf = (BufferDX11<int>*)buffInfo[i].m_buffer;
		if( buffInfo[i].m_isReadOnly )
		{
			dddx->m_context->CSSetShaderResources( launcher->m_idx++, 1, dBuf->getSRVPtr() );
		}
		else
		{
			// todo. cannot initialize append buffer with proper counter value which is the last arg
			dddx->m_context->CSSetUnorderedAccessViews( launcher->m_idxRw++, 1, dBuf->getUAVPtr(), 0 );
		}
	}
}
template<typename T>
void LauncherDX11::setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts )
{
	// Upload 'consts' into the constant buffer and bind it to slot b0.
	// (Removed an unused KernelDX11 local and a dead Map/WRITE_DISCARD
	// variant that had been commented out.)
	const DeviceDX11* dddx = (const DeviceDX11*)launcher->m_deviceData;
	BufferDX11<T>* dBuf = (BufferDX11<T>*)&constBuff;
	dddx->m_context->UpdateSubresource( dBuf->getBuffer(), 0, NULL, &consts, 0, 0 );
	dddx->m_context->CSSetConstantBuffers( 0, 1, dBuf->getBufferPtr() );
}
void LauncherDX11::launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
	// Bind the shader, dispatch ceil(numThreads/localSize) groups per axis
	// (at least one group each), then unbind everything that was set so the
	// next pass starts from a clean compute stage.
	KernelDX11* dxKernel = (KernelDX11*)launcher->m_kernel;
	const DeviceDX11* dddx = (const DeviceDX11*)launcher->m_deviceData;
	dddx->m_context->CSSetShader( dxKernel->getKernel(), NULL, 0 );

	int groupsX = numThreadsX/localSizeX;
	if( numThreadsX%localSizeX ) groupsX++;
	if( groupsX < 1 ) groupsX = 1;
	int groupsY = numThreadsY/localSizeY;
	if( numThreadsY%localSizeY ) groupsY++;
	if( groupsY < 1 ) groupsY = 1;
	dddx->m_context->Dispatch( groupsX, groupsY, 1 );

	// Reset compute-stage bindings.
	dddx->m_context->CSSetShader( NULL, NULL, 0 );
	if( launcher->m_idxRw )
	{
		ID3D11UnorderedAccessView* nullUAVs[ 16 ] = { 0 };
		dddx->m_context->CSSetUnorderedAccessViews( 0,
			min( (unsigned int)launcher->m_idxRw, sizeof(nullUAVs)/sizeof(*nullUAVs) ), nullUAVs, NULL );
	}
	if( launcher->m_idx )
	{
		ID3D11ShaderResourceView* nullSRVs[16] = { 0 };
		dddx->m_context->CSSetShaderResources( 0,
			min( (unsigned int)launcher->m_idx, sizeof(nullSRVs)/sizeof(*nullSRVs) ), nullSRVs );
	}
}
#undef SAFE_RELEASE
};

View File

@@ -0,0 +1,131 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
namespace adl
{
// GPU stopwatch built on D3D11 timestamp queries. Usage: init(), start(),
// optional split()s, stop(), then getMs() to read the intervals back
// (reading spins until the GPU has produced the timestamps).
struct StopwatchDX11 : public StopwatchBase
{
	public:
		__inline
		StopwatchDX11() : StopwatchBase(){}
		__inline
		~StopwatchDX11();

		// Creates the queries; requires a TYPE_DX11 device.
		__inline
		void init( const Device* deviceData );
		__inline
		void start();
		__inline
		void split();
		__inline
		void stop();
		// Milliseconds between timestamps index and index+1.
		__inline
		float getMs(int index=0);
		__inline
		void getMs( float* times, int capacity );

	public:
		// One timestamp query per split, +1 for the final stop().
		ID3D11Query* m_tQuery[CAPACITY+1];
		// Disjoint query: supplies the timestamp frequency and validity flag.
		ID3D11Query* m_fQuery;
		// Resolved timestamps.
		// NOTE(review): up to CAPACITY+1 timestamps can be recorded but only
		// CAPACITY slots exist here — verify getMs() is never asked for the
		// last interval at full capacity.
		UINT64 m_t[CAPACITY];
};
void StopwatchDX11::init( const Device* deviceData )
{
ADLASSERT( deviceData->m_type == TYPE_DX11 );
m_device = deviceData;
{
D3D11_QUERY_DESC qDesc;
qDesc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT;
qDesc.MiscFlags = 0;
((const DeviceDX11*)m_device)->m_device->CreateQuery( &qDesc, &m_fQuery );
}
for(int i=0; i<CAPACITY+1; i++)
{
D3D11_QUERY_DESC qDesc;
qDesc.Query = D3D11_QUERY_TIMESTAMP;
qDesc.MiscFlags = 0;
((const DeviceDX11*)m_device)->m_device->CreateQuery( &qDesc, &m_tQuery[i] );
}
}
StopwatchDX11::~StopwatchDX11()
{
	// Release the disjoint query plus every timestamp query created in init().
	m_fQuery->Release();
	for(int q=0; q<CAPACITY+1; q++)
		m_tQuery[q]->Release();
}
void StopwatchDX11::start()
{
m_idx = 0;
((const DeviceDX11*)m_device)->m_context->Begin( m_fQuery );
((const DeviceDX11*)m_device)->m_context->End( m_tQuery[m_idx++] );
}
void StopwatchDX11::split()
{
if( m_idx < CAPACITY )
((const DeviceDX11*)m_device)->m_context->End( m_tQuery[m_idx++] );
}
void StopwatchDX11::stop()
{
((const DeviceDX11*)m_device)->m_context->End( m_tQuery[m_idx++] );
((const DeviceDX11*)m_device)->m_context->End( m_fQuery );
}
float StopwatchDX11::getMs(int index)
{
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT d;
// m_deviceData->m_context->End( m_fQuery );
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_fQuery, &d,sizeof(D3D11_QUERY_DATA_TIMESTAMP_DISJOINT),0 ) == S_FALSE ) {}
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_tQuery[0], &m_t[index],sizeof(UINT64),0 ) == S_FALSE ){}
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_tQuery[1], &m_t[index+1],sizeof(UINT64),0 ) == S_FALSE ){}
ADLASSERT( d.Disjoint == false );
float elapsedMs = (m_t[index+1] - m_t[index])/(float)d.Frequency*1000;
return elapsedMs;
}
void StopwatchDX11::getMs( float* times, int capacity )
{
ADLASSERT( capacity <= CAPACITY );
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT d;
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_fQuery, &d,sizeof(D3D11_QUERY_DATA_TIMESTAMP_DISJOINT),0 ) == S_FALSE ) {}
for(int i=0; i<m_idx; i++)
{
while( ((const DeviceDX11*)m_device)->m_context->GetData( m_tQuery[i], &m_t[i],sizeof(UINT64),0 ) == S_FALSE ){}
}
ADLASSERT( d.Disjoint == false );
for(int i=0; i<capacity; i++)
{
times[i] = (m_t[i+1] - m_t[i])/(float)d.Frequency*1000;
}
}
};

View File

@@ -0,0 +1,107 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
namespace adl
{
// CPU reference backend: buffers are plain heap arrays and every operation
// completes synchronously.
struct DeviceHost : public Device
{
	DeviceHost() : Device( TYPE_HOST ){}

	__inline
	void initialize(const Config& cfg);
	__inline
	void release();

	// Allocates nElems elements on the heap (no-op for BUFFER_CONST).
	template<typename T>
	__inline
	void allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type);

	template<typename T>
	__inline
	void deallocate(Buffer<T>* buf);

	// memcpy-based copies; all Buffer arguments must live on the host.
	template<typename T>
	__inline
	void copy(Buffer<T>* dst, const Buffer<T>* src, int nElems);

	template<typename T>
	__inline
	void copy(T* dst, const Buffer<T>* src, int nElems, int offsetNElems = 0);

	template<typename T>
	__inline
	void copy(Buffer<T>* dst, const T* src, int nElems, int offsetNElems = 0);

	// No-op: host operations are synchronous.
	__inline
	void waitForCompletion() const;
};
// The host backend needs no platform initialization.
void DeviceHost::initialize(const Config& cfg)
{
}
// Nothing to tear down for the host backend.
void DeviceHost::release()
{
}
template<typename T>
void DeviceHost::allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type)
{
	// Plain heap allocation; constant buffers are a no-op on the host backend.
	buf->m_device = this;
	if( type == BufferBase::BUFFER_CONST )
		return;
	T* storage = new T[nElems];
	ADLASSERT( storage );
	buf->m_ptr = storage;
	buf->m_size = nElems;
}
template<typename T>
void DeviceHost::deallocate(Buffer<T>* buf)
{
	// Free the array and detach it, so a repeated deallocate (or a later
	// destructor) cannot double-delete — this mirrors DeviceDX11::deallocate,
	// which also nulls its handles after releasing them.
	if( buf->m_ptr )
	{
		delete [] buf->m_ptr;
		buf->m_ptr = 0;
	}
	buf->m_size = 0;
}
template<typename T>
void DeviceHost::copy(Buffer<T>* dst, const Buffer<T>* src, int nElems)
{
	// Forwards to the raw-pointer overload below (which asserts dst is a
	// host buffer). NOTE(review): src->m_ptr is read directly, so src is
	// assumed to be a host buffer too — verify against callers.
	copy( dst, src->m_ptr, nElems );
}
template<typename T>
void DeviceHost::copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems)
{
	// Read nElems elements starting at srcOffsetNElems into plain host memory.
	ADLASSERT( src->getType() == TYPE_HOST );
	memcpy( dst, src->m_ptr+srcOffsetNElems, nElems*sizeof(T) );
}
template<typename T>
void DeviceHost::copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems)
{
	// Write nElems elements from host memory into dst at dstOffsetNElems.
	ADLASSERT( dst->getType() == TYPE_HOST );
	memcpy( dst->m_ptr+dstOffsetNElems, src, nElems*sizeof(T) );
}
void DeviceHost::waitForCompletion() const
{
	// Host operations are synchronous; nothing to wait for.
}
};

View File

@@ -0,0 +1,119 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/time.h>
#endif
namespace adl
{
// CPU stopwatch: QueryPerformanceCounter on Windows, gettimeofday elsewhere.
// Same start/split/stop/getMs protocol as the GPU stopwatches.
class StopwatchHost : public StopwatchBase
{
	public:
		__inline
		StopwatchHost();

		__inline
		void init( const Device* deviceData );
		__inline
		void start();
		__inline
		void split();
		// Equivalent to a final split().
		__inline
		void stop();
		// Milliseconds between splits index and index+1.
		__inline
		float getMs(int index=0);
		__inline
		void getMs( float* times, int capacity );

	private:
#ifdef _WIN32
		// Performance-counter ticks per second, captured once in init().
		LARGE_INTEGER m_frequency;
		LARGE_INTEGER m_t[CAPACITY];
#else
		// NOTE(review): written in init() but never read afterwards — verify.
		struct timeval mStartTime;
		timeval m_t[CAPACITY];
#endif
};
__inline
StopwatchHost::StopwatchHost()
	: StopwatchBase()
{
}

__inline
void StopwatchHost::init( const Device* deviceData )
{
	// Any device type is accepted; timing uses CPU clocks only.
	m_device = deviceData;
#ifdef _WIN32
	// Cache the performance-counter frequency for getMs().
	QueryPerformanceFrequency( &m_frequency );
#else
	gettimeofday(&mStartTime, 0);
#endif
}
__inline
void StopwatchHost::start()
{
	// Reset the split counter and record the first timestamp.
	m_idx = 0;
#ifdef _WIN32
	QueryPerformanceCounter(&m_t[m_idx++]);
#else
	gettimeofday(&m_t[m_idx++], 0);
#endif
}

__inline
void StopwatchHost::split()
{
	// Record an intermediate timestamp.
	// NOTE(review): unlike StopwatchDX11::split there is no bounds check
	// against CAPACITY here — verify callers never exceed it.
#ifdef _WIN32
	QueryPerformanceCounter(&m_t[m_idx++]);
#else
	gettimeofday(&m_t[m_idx++], 0);
#endif
}

__inline
void StopwatchHost::stop()
{
	// Stopping is just a final split.
	split();
}
__inline
float StopwatchHost::getMs(int index)
{
	// Milliseconds between splits 'index' and 'index+1'.
#ifdef _WIN32
	return (float)(1000*(m_t[index+1].QuadPart - m_t[index].QuadPart))/m_frequency.QuadPart;
#else
	// Do the microsecond term in floating point: the original divided
	// integers by 1000, truncating away all sub-millisecond resolution.
	return (m_t[index+1].tv_sec - m_t[index].tv_sec) * 1000.f +
		(m_t[index+1].tv_usec - m_t[index].tv_usec) / 1000.f;
#endif
}
__inline
void StopwatchHost::getMs(float* times, int capacity)
{
	// Fill 'times' with the duration of each recorded interval; any slots
	// beyond the number of intervals are zeroed.
	int nIntervals = m_idx - 1;
	if( nIntervals > capacity ) nIntervals = capacity;
	for(int i=0; i<capacity; i++) times[i] = 0.f;
	for(int i=0; i<nIntervals; i++) times[i] = getMs(i);
}
};

View File

@@ -0,0 +1,73 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma once
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
namespace adl
{
// Options shared by all Copy backends: how many float4 elements each
// work item copies.
class CopyBase
{
	public:
		enum Option
		{
			PER_WI_1,	// one float4 per work item
			PER_WI_2,	// two per work item (n must be divisible by 2)
			PER_WI_4,	// four per work item (n must be divisible by 4)
		};
};
// Device copy primitive: float4/float2/float buffer copies implemented as
// compute kernels, specialized per backend via TYPE.
template<DeviceType TYPE>
class Copy : public CopyBase
{
	public:
		typedef Launcher::BufferInfo BufferInfo;

		// Per-device state: one compiled kernel per element width/stride and a
		// constant buffer carrying the element count.
		struct Data
		{
			const Device* m_device;
			Kernel* m_copy1F4Kernel;
			Kernel* m_copy2F4Kernel;
			Kernel* m_copy4F4Kernel;
			Kernel* m_copyF1Kernel;
			Kernel* m_copyF2Kernel;
			Buffer<int4>* m_constBuffer;
		};

		static
		Data* allocate(const Device* deviceData);

		static
		void deallocate(Data* data);

		// Copies the first n elements of src into dst. 'option' selects how
		// many float4s each work item handles (n must be divisible by it).
		static
		void execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option = PER_WI_1);

		static
		void execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n);

		static
		void execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n);
};
#include <AdlPrimitives/Copy/CopyHost.inl>
#include <AdlPrimitives/Copy/Copy.inl>
};

View File

@@ -0,0 +1,151 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Copy\\CopyKernels"
#define KERNEL0 "Copy1F4Kernel"
#define KERNEL1 "Copy2F4Kernel"
#define KERNEL2 "Copy4F4Kernel"
#define KERNEL3 "CopyF1Kernel"
#define KERNEL4 "CopyF2Kernel"
#include <AdlPrimitives/Copy/CopyKernelsCL.h>
#include <AdlPrimitives/Copy/CopyKernelsDX11.h>
template<DeviceType TYPE>
typename Copy<TYPE>::Data* Copy<TYPE>::allocate( const Device* device )
{
	// Build the per-device kernel set for the copy primitive. When kernels
	// are embedded as strings, src[] is indexed by the device type;
	// otherwise the kernels are loaded from PATH on disk.
	ADLASSERT( TYPE == device->m_type );
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
	const char* src[] = { copyKernelsCL, copyKernelsDX11 };
#else
	const char* src[] = { 0, 0 };
#endif

	Data* data = new Data;
	data->m_device = device;
	data->m_copy1F4Kernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
	data->m_copy2F4Kernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
	data->m_copy4F4Kernel = device->getKernel( PATH, KERNEL2, 0, src[TYPE] );
	data->m_copyF1Kernel = device->getKernel( PATH, KERNEL3, 0, src[TYPE] );
	data->m_copyF2Kernel = device->getKernel( PATH, KERNEL4, 0, src[TYPE] );
	data->m_constBuffer = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
	return data;
}
template<DeviceType TYPE>
void Copy<TYPE>::deallocate( Data* data )
{
	// NOTE(review): the five kernels are not released here — presumably they
	// are cached and owned by the device's kernel manager; verify.
	delete data->m_constBuffer;
	delete data;
}
template<DeviceType TYPE>
void Copy<TYPE>::execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option )
{
	// Copy n float4 elements; 'option' picks the kernel and how many
	// elements each work item handles (n must be divisible by that count).
	ADLASSERT( TYPE == dst.getType() );
	ADLASSERT( TYPE == src.getType() );

	Kernel* kernel = 0;
	int elemsPerWI = 0;
	switch (option)
	{
	case PER_WI_1: kernel = data->m_copy1F4Kernel; elemsPerWI = 1; break;
	case PER_WI_2: kernel = data->m_copy2F4Kernel; elemsPerWI = 2; break;
	case PER_WI_4: kernel = data->m_copy4F4Kernel; elemsPerWI = 4; break;
	default:
		ADLASSERT(0);
		return;
	};
	ADLASSERT( n%elemsPerWI == 0 );

	int4 constBuffer;
	constBuffer.x = n;
	BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };

	Launcher launcher( data->m_device, kernel );
	launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
	launcher.setConst( *data->m_constBuffer, constBuffer );
	launcher.launch1D( n/elemsPerWI );
}
template<DeviceType TYPE>
void Copy<TYPE>::execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n )
{
	// Copy n float2 elements, one per work item.
	ADLASSERT( TYPE == dst.getType() );
	ADLASSERT( TYPE == src.getType() );

	int4 constBuffer;
	constBuffer.x = n;
	BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };

	Launcher launcher( data->m_device, data->m_copyF2Kernel );
	launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
	launcher.setConst( *data->m_constBuffer, constBuffer );
	launcher.launch1D( n );
}
template<DeviceType TYPE>
void Copy<TYPE>::execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n )
{
	// Copy n float elements, one per work item.
	ADLASSERT( TYPE == dst.getType() );
	ADLASSERT( TYPE == src.getType() );

	int4 constBuffer;
	constBuffer.x = n;
	BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };

	Launcher launcher( data->m_device, data->m_copyF1Kernel );
	launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
	launcher.setConst( *data->m_constBuffer, constBuffer );
	launcher.launch1D( n );
}
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2
#undef KERNEL3
#undef KERNEL4

View File

@@ -0,0 +1,85 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
template<>
class Copy<TYPE_HOST> : public CopyBase
{
public:
typedef Launcher::BufferInfo BufferInfo;
struct Data
{
};
static
Data* allocate(const Device* deviceData)
{
ADLASSERT( TYPE_HOST == deviceData->m_type );
return 0;
}
static
void deallocate(Data* data)
{
return;
}
static
void execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option = PER_WI_1)
{
ADLASSERT( TYPE_HOST == dst.getType() );
ADLASSERT( TYPE_HOST == src.getType() );
HostBuffer<float4>& dstH = (HostBuffer<float4>&)dst;
HostBuffer<float4>& srcH = (HostBuffer<float4>&)src;
for(int i=0; i<n; i++)
{
dstH[i] = srcH[i];
}
}
static
void execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n)
{
ADLASSERT( TYPE_HOST == dst.getType() );
ADLASSERT( TYPE_HOST == src.getType() );
HostBuffer<float2>& dstH = (HostBuffer<float2>&)dst;
HostBuffer<float2>& srcH = (HostBuffer<float2>&)src;
for(int i=0; i<n; i++)
{
dstH[i] = srcH[i];
}
}
static
void execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n)
{
ADLASSERT( TYPE_HOST == dst.getType() );
ADLASSERT( TYPE_HOST == src.getType() );
HostBuffer<float>& dstH = (HostBuffer<float>&)dst;
HostBuffer<float>& srcH = (HostBuffer<float>&)src;
for(int i=0; i<n; i++)
{
dstH[i] = srcH[i];
}
}
};

View File

@@ -0,0 +1,128 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)
//Per-dispatch constants shared by all copy kernels.
typedef struct
{
int m_n; //number of elements to copy
int m_padding[3]; //pads the struct to 16 bytes — presumably to match the DX11 cbuffer layout; confirm
} ConstBuffer;
//One float4 per work item: dst[i] = src[i] for every i < cb.m_n.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy1F4Kernel(__global float4* dst, __global float4* src,
				ConstBuffer cb)
{
	int i = GET_GLOBAL_IDX;

	if( i < cb.m_n )
	{
		dst[i] = src[i];
	}
}
//Two float4 elements per work item.
//Fix: the original guard '2*gIdx <= cb.m_n' allowed the boundary work item to
//read and write one element pair past the end of the buffers (e.g. for m_n==2k,
//gIdx==k accessed indices 2k and 2k+1). Each element is now bounds-checked, so
//the kernel is also safe when m_n is odd.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy2F4Kernel(__global float4* dst, __global float4* src,
				ConstBuffer cb)
{
	int idx = 2*GET_GLOBAL_IDX;

	if( idx < cb.m_n )
	{
		dst[idx] = src[idx];
		if( idx+1 < cb.m_n )
		{
			dst[idx+1] = src[idx+1];
		}
	}
}
//Four float4 elements per work item.
//Fix: the original guard '4*gIdx <= cb.m_n' allowed the boundary work item to
//read and write up to four elements past the end of the buffers. Each element
//is now bounds-checked, so a tail shorter than four is handled correctly too.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void Copy4F4Kernel(__global float4* dst, __global float4* src,
				ConstBuffer cb)
{
	int base = 4*GET_GLOBAL_IDX;

	if( base < cb.m_n )
	{
		for(int i=0; i<4; i++)
		{
			int idx = base+i;
			if( idx < cb.m_n )
			{
				dst[idx] = src[idx];
			}
		}
	}
}
//One float per work item: dstF1[i] = srcF1[i] for every i < cb.m_n.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void CopyF1Kernel(__global float* dstF1, __global float* srcF1,
				ConstBuffer cb)
{
	int i = GET_GLOBAL_IDX;

	if( i < cb.m_n )
	{
		dstF1[i] = srcF1[i];
	}
}
//One float2 per work item: dstF2[i] = srcF2[i] for every i < cb.m_n.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2,
				ConstBuffer cb)
{
	int i = GET_GLOBAL_IDX;

	if( i < cb.m_n )
	{
		dstF2[i] = srcF2[i];
	}
}

View File

@@ -0,0 +1,130 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
typedef uint u32;
#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
#define GROUP_MEM_FENCE
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
#define AtomInc(x) InterlockedAdd(x, 1)
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
#define make_uint4 uint4
#define make_uint2 uint2
#define make_int2 int2
#define WG_SIZE 64
#define GET_GROUP_SIZE WG_SIZE
//Per-dispatch constants for the copy kernels (matches the OpenCL ConstBuffer layout).
cbuffer CB : register( b0 )
{
	int m_n;
	int m_padding[3];
};

RWStructuredBuffer<float4> dst : register( u0 );
StructuredBuffer<float4> src : register( t0 );

//One float4 per thread: dst[i] = src[i] for every i < m_n.
[numthreads(WG_SIZE, 1, 1)]
void Copy1F4Kernel( DEFAULT_ARGS )
{
	int i = GET_GLOBAL_IDX;

	if( i < m_n )
	{
		dst[i] = src[i];
	}
}
//Two float4 elements per thread.
//Fix: the original guard '2*gIdx <= m_n' allowed the boundary thread to read
//and write one element pair past the end of the buffers. Each element is now
//bounds-checked, so the kernel is also safe when m_n is odd.
[numthreads(WG_SIZE, 1, 1)]
void Copy2F4Kernel( DEFAULT_ARGS )
{
	int idx = 2*GET_GLOBAL_IDX;

	if( idx < m_n )
	{
		dst[idx] = src[idx];
		if( idx+1 < m_n )
		{
			dst[idx+1] = src[idx+1];
		}
	}
}
//Four float4 elements per thread.
//Fix: the original guard '4*gIdx <= m_n' allowed the boundary thread to read
//and write up to four elements past the end of the buffers. Each element is
//now bounds-checked, so a tail shorter than four is handled correctly too.
[numthreads(WG_SIZE, 1, 1)]
void Copy4F4Kernel( DEFAULT_ARGS )
{
	int base = 4*GET_GLOBAL_IDX;

	if( base < m_n )
	{
		for(int i=0; i<4; i++)
		{
			int idx = base+i;
			if( idx < m_n )
			{
				dst[idx] = src[idx];
			}
		}
	}
}
RWStructuredBuffer<float> dstF1 : register( u0 );
StructuredBuffer<float> srcF1 : register( t0 );

//One float per thread: dstF1[i] = srcF1[i] for every i < m_n.
[numthreads(WG_SIZE, 1, 1)]
void CopyF1Kernel( DEFAULT_ARGS )
{
	int i = GET_GLOBAL_IDX;

	if( i < m_n )
	{
		dstF1[i] = srcF1[i];
	}
}
RWStructuredBuffer<float2> dstF2 : register( u0 );
StructuredBuffer<float2> srcF2 : register( t0 );

//One float2 per thread: dstF2[i] = srcF2[i] for every i < m_n.
[numthreads(WG_SIZE, 1, 1)]
void CopyF2Kernel( DEFAULT_ARGS )
{
	int i = GET_GLOBAL_IDX;

	if( i < m_n )
	{
		dstF2[i] = srcF2[i];
	}
}

View File

@@ -0,0 +1,119 @@
//Embedded OpenCL source for the copy kernels (mirrors CopyKernels.cl); used
//when kernels are built from strings instead of loaded from files.
//Fix: Copy2F4Kernel and Copy4F4Kernel used '<=' bounds guards that let the
//boundary work item read/write past the end of the buffers; every element is
//now bounds-checked individually.
static const char* copyKernelsCL= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
" int m_n;\n"
" int m_padding[3];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx];\n"
"\n"
" dst[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int idx = 2*GET_GLOBAL_IDX;\n"
"\n"
" if( idx < cb.m_n )\n"
" {\n"
" dst[idx] = src[idx];\n"
" if( idx+1 < cb.m_n )\n"
" {\n"
" dst[idx+1] = src[idx+1];\n"
" }\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int base = 4*GET_GLOBAL_IDX;\n"
"\n"
" if( base < cb.m_n )\n"
" {\n"
" for(int i=0; i<4; i++)\n"
" {\n"
" int idx = base+i;\n"
" if( idx < cb.m_n ) dst[idx] = src[idx];\n"
" }\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float a0 = srcF1[gIdx];\n"
"\n"
" dstF1[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float2 a0 = srcF2[gIdx];\n"
"\n"
" dstF2[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,120 @@
//Embedded HLSL source for the copy kernels (mirrors the DX11 compute shaders);
//used when kernels are built from strings instead of loaded from files.
//Fix: Copy2F4Kernel and Copy4F4Kernel used '<=' bounds guards that let the
//boundary thread read/write past the end of the buffers; every element is now
//bounds-checked individually.
static const char* copyKernelsDX11= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define GROUP_MEM_FENCE\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"#define make_uint4 uint4\n"
"#define make_uint2 uint2\n"
"#define make_int2 int2\n"
"\n"
"#define WG_SIZE 64\n"
"\n"
"#define GET_GROUP_SIZE WG_SIZE\n"
"\n"
"\n"
"\n"
"cbuffer CB : register( b0 )\n"
"{\n"
" int m_n;\n"
" int m_padding[3];\n"
"};\n"
"\n"
"RWStructuredBuffer<float4> dst : register( u0 );\n"
"StructuredBuffer<float4> src : register( t0 );\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void Copy1F4Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < m_n )\n"
" {\n"
" float4 a0 = src[gIdx];\n"
"\n"
" dst[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void Copy2F4Kernel( DEFAULT_ARGS )\n"
"{\n"
" int idx = 2*GET_GLOBAL_IDX;\n"
"\n"
" if( idx < m_n )\n"
" {\n"
" dst[idx] = src[idx];\n"
" if( idx+1 < m_n )\n"
" {\n"
" dst[idx+1] = src[idx+1];\n"
" }\n"
" }\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void Copy4F4Kernel( DEFAULT_ARGS )\n"
"{\n"
" int base = 4*GET_GLOBAL_IDX;\n"
"\n"
" if( base < m_n )\n"
" {\n"
" for(int i=0; i<4; i++)\n"
" {\n"
" int idx = base+i;\n"
" if( idx < m_n ) dst[idx] = src[idx];\n"
" }\n"
" }\n"
"}\n"
"\n"
"RWStructuredBuffer<float> dstF1 : register( u0 );\n"
"StructuredBuffer<float> srcF1 : register( t0 );\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void CopyF1Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < m_n )\n"
" {\n"
" float a0 = srcF1[gIdx];\n"
"\n"
" dstF1[ gIdx ] = a0;\n"
" }\n"
"\n"
"}\n"
"\n"
"RWStructuredBuffer<float2> dstF2 : register( u0 );\n"
"StructuredBuffer<float2> srcF2 : register( t0 );\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void CopyF2Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < m_n )\n"
" {\n"
" float2 a0 = srcF2[gIdx];\n"
"\n"
" dstF2[ gIdx ] = a0;\n"
" }\n"
"}\n"
;

View File

@@ -0,0 +1,77 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma once
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
namespace adl
{
//Common base for the Fill primitive; holds launch options shared by all backends.
class FillBase
{
public:
enum Option
{
//no options defined yet
};
};
//GPU (OpenCL/DX11) Fill primitive: writes a constant value into a range of a
//device buffer. TYPE selects the backend; a host specialization lives in
//FillHost.inl.
template<DeviceType TYPE>
class Fill
{
public:
typedef Launcher::BufferInfo BufferInfo;
//Kernel constant-buffer layout; must match ConstBuffer in FillKernels.cl/.hlsl.
struct ConstData
{
int4 m_data; //value to write (x, xy or xyzw used depending on kernel)
int m_offset; //first element index to fill
int m_n; //number of elements to fill
int m_padding[2]; //pad for 16-byte constant-buffer alignment — TODO confirm
};
//Per-device state created by allocate() and released by deallocate().
struct Data
{
const Device* m_device;
Kernel* m_fillIntKernel;
Kernel* m_fillInt2Kernel;
Kernel* m_fillInt4Kernel;
Buffer<ConstData>* m_constBuffer;
};
//Compiles the fill kernels for 'deviceData' and allocates the constant buffer.
static
Data* allocate(const Device* deviceData);
static
void deallocate(Data* data);
//Each execute() writes 'value' into src[offset..offset+n-1].
static
void execute(Data* data, Buffer<int>& src, const int& value, int n, int offset = 0);
static
void execute(Data* data, Buffer<int2>& src, const int2& value, int n, int offset = 0);
static
void execute(Data* data, Buffer<int4>& src, const int4& value, int n, int offset = 0);
};
#include <AdlPrimitives/Fill/FillHost.inl>
#include <AdlPrimitives/Fill/Fill.inl>
};

View File

@@ -0,0 +1,123 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//#define PATH "..\\..\\AdlPrimitives\\Fill\\FillKernels"
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Fill\\FillKernels"
#define KERNEL0 "FillIntKernel"
#define KERNEL1 "FillInt2Kernel"
#define KERNEL2 "FillInt4Kernel"
#include <AdlPrimitives/Fill/FillKernelsCL.h>
#include <AdlPrimitives/Fill/FillKernelsDX11.h>
//Builds the three fill kernels for the given device and allocates the shared
//constant buffer. The caller owns the returned Data and must release it with
//deallocate().
template<DeviceType TYPE>
typename Fill<TYPE>::Data* Fill<TYPE>::allocate( const Device* device )
{
	ADLASSERT( device->m_type == TYPE );

	//Embedded kernel source indexed by backend; null entries make getKernel load from PATH.
	const char* src[] =
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
		{fillKernelsCL, fillKernelsDX11};
#else
		{0,0};
#endif

	Data* fillData = new Data;
	fillData->m_device = device;
	fillData->m_fillIntKernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
	fillData->m_fillInt2Kernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
	fillData->m_fillInt4Kernel = device->getKernel( PATH, KERNEL2, 0, src[TYPE] );
	fillData->m_constBuffer = new Buffer<ConstData>( device, 1, BufferBase::BUFFER_CONST );
	return fillData;
}
//Releases the state created by allocate(). Kernels are not deleted here —
//presumably they are cached/owned by the device; confirm against Device::getKernel.
template<DeviceType TYPE>
void Fill<TYPE>::deallocate( Data* data )
{
	delete data->m_constBuffer;
	delete data;
}
//Writes 'value' into src[offset..offset+n-1] using the int fill kernel.
template<DeviceType TYPE>
void Fill<TYPE>::execute(Data* data, Buffer<int>& src, const int& value, int n, int offset)
{
	ADLASSERT( n>0 );

	ConstData cd;
	cd.m_data = make_int4( value );
	cd.m_n = n;
	cd.m_offset = offset;

	BufferInfo bInfo[] = { BufferInfo( &src ) };

	Launcher launcher( data->m_device, data->m_fillIntKernel );
	launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
	launcher.setConst( *data->m_constBuffer, cd );
	launcher.launch1D( n );
}
//Writes 'value' into src[offset..offset+n-1] using the int2 fill kernel.
template<DeviceType TYPE>
void Fill<TYPE>::execute(Data* data, Buffer<int2>& src, const int2& value, int n, int offset)
{
	ADLASSERT( n>0 );

	ConstData cd;
	cd.m_data = make_int4( value.x, value.y, 0, 0 ); //z,w unused by the int2 kernel
	cd.m_n = n;
	cd.m_offset = offset;

	BufferInfo bInfo[] = { BufferInfo( &src ) };

	Launcher launcher( data->m_device, data->m_fillInt2Kernel );
	launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
	launcher.setConst( *data->m_constBuffer, cd );
	launcher.launch1D( n );
}
//Writes 'value' into src[offset..offset+n-1] using the int4 fill kernel.
template<DeviceType TYPE>
void Fill<TYPE>::execute(Data* data, Buffer<int4>& src, const int4& value, int n, int offset)
{
	ADLASSERT( n>0 );

	ConstData cd;
	cd.m_data = value;
	cd.m_n = n;
	cd.m_offset = offset;

	BufferInfo bInfo[] = { BufferInfo( &src ) };

	Launcher launcher( data->m_device, data->m_fillInt4Kernel );
	launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
	launcher.setConst( *data->m_constBuffer, cd );
	launcher.launch1D( n );
}
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2

View File

@@ -0,0 +1,99 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
template<>
class Fill<TYPE_HOST>
{
public:
struct Data
{
};
static
Data* allocate(const Device* deviceData)
{
return 0;
}
static
void deallocate(Data* data)
{
}
template<typename T>
static
void executeImpl(Data* data, Buffer<T>& src, const T& value, int n, int offset = 0)
{
ADLASSERT( src.getType() == TYPE_HOST );
ADLASSERT( src.m_size >= offset+n );
HostBuffer<T>& hSrc = (HostBuffer<T>&)src;
for(int idx=offset; idx<offset+n; idx++)
{
hSrc[idx] = value;
}
}
static
void execute(Data* data, Buffer<int>& src, const int& value, int n, int offset = 0)
{
executeImpl( data, src, value, n, offset );
}
static
void execute(Data* data, Buffer<int2>& src, const int2& value, int n, int offset = 0)
{
executeImpl( data, src, value, n, offset );
}
static
void execute(Data* data, Buffer<int4>& src, const int4& value, int n, int offset = 0)
{
executeImpl( data, src, value, n, offset );
}
/*
static
void execute(Data* data, Buffer<int>& src, int value, int n, int offset = 0)
{
ADLASSERT( src.getType() == TYPE_HOST );
ADLASSERT( src.m_size <= offset+n );
HostBuffer<u32>& hSrc = (HostBuffer<u32>&)src;
for(int idx=offset; idx<offset+n; idx++)
{
src[i] = value;
}
}
static
void execute(Data* data, Buffer<int2>& src, const int2& value, int n, int offset = 0)
{
ADLASSERT( src.getType() == TYPE_HOST );
ADLASSERT( src.m_size <= offset+n );
}
static
void execute(Data* data, Buffer<int4>& src, const int4& value, int n, int offset = 0)
{
ADLASSERT( src.getType() == TYPE_HOST );
ADLASSERT( src.m_size <= offset+n );
}
*/
};

View File

@@ -0,0 +1,81 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)
//Per-dispatch constants shared by all fill kernels.
typedef struct
{
int4 m_data; //value to write (x, xy or xyzw used depending on kernel)
int m_offset; //first element index to fill
int m_n; //number of elements to fill
int m_padding[2]; //pads the struct to 32 bytes — presumably for constant-buffer layout parity with DX11; confirm
} ConstBuffer;
//dstInt[cb.m_offset + i] = cb.m_data.x for every i < cb.m_n.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillIntKernel(__global int* dstInt,
				ConstBuffer cb)
{
	int i = GET_GLOBAL_IDX;

	if( i < cb.m_n )
	{
		dstInt[ cb.m_offset + i ] = cb.m_data.x;
	}
}
//dstInt2[cb.m_offset + i] = (cb.m_data.x, cb.m_data.y) for every i < cb.m_n.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillInt2Kernel(__global int2* dstInt2,
				ConstBuffer cb)
{
	int i = GET_GLOBAL_IDX;

	if( i < cb.m_n )
	{
		dstInt2[ cb.m_offset + i ] = make_int2( cb.m_data.x, cb.m_data.y );
	}
}
//dstInt4[cb.m_offset + i] = cb.m_data for every i < cb.m_n.
__kernel
__attribute__((reqd_work_group_size(64,1,1)))
void FillInt4Kernel(__global int4* dstInt4,
				ConstBuffer cb)
{
	int i = GET_GLOBAL_IDX;

	if( i < cb.m_n )
	{
		dstInt4[ cb.m_offset + i ] = cb.m_data;
	}
}

View File

@@ -0,0 +1,79 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
typedef uint u32;
#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
#define GROUP_MEM_FENCE
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
#define AtomInc(x) InterlockedAdd(x, 1)
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
#define make_uint4 uint4
#define make_uint2 uint2
#define make_int2 int2
//Per-dispatch constants for the fill kernels (matches the OpenCL ConstBuffer layout).
cbuffer CB : register( b0 )
{
	int4 m_data;
	int m_offset;
	int m_n;
	int m_padding[2];
};

RWStructuredBuffer<int> dstInt : register( u0 );

//dstInt[m_offset + i] = m_data.x for every i < m_n.
[numthreads(64, 1, 1)]
void FillIntKernel( DEFAULT_ARGS )
{
	int i = GET_GLOBAL_IDX;

	if( i < m_n )
	{
		dstInt[ m_offset + i ] = m_data.x;
	}
}
RWStructuredBuffer<int2> dstInt2 : register( u0 );

//dstInt2[m_offset + i] = (m_data.x, m_data.y) for every i < m_n.
[numthreads(64, 1, 1)]
void FillInt2Kernel( DEFAULT_ARGS )
{
	int i = GET_GLOBAL_IDX;

	if( i < m_n )
	{
		dstInt2[ m_offset + i ] = make_int2( m_data.x, m_data.y );
	}
}
RWStructuredBuffer<int4> dstInt4 : register( u0 );

//dstInt4[m_offset + i] = m_data for every i < m_n.
[numthreads(64, 1, 1)]
void FillInt4Kernel( DEFAULT_ARGS )
{
	int i = GET_GLOBAL_IDX;

	if( i < m_n )
	{
		dstInt4[ m_offset + i ] = m_data;
	}
}

View File

@@ -0,0 +1,71 @@
//Embedded OpenCL source for the fill kernels (mirrors FillKernels.cl); used
//when ADL_LOAD_KERNEL_FROM_STRING is defined. Keep in sync with the .cl file.
static const char* fillKernelsCL= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
" int4 m_data;\n"
" int m_offset;\n"
" int m_n;\n"
" int m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillIntKernel(__global int* dstInt, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" dstInt[ cb.m_offset+gIdx ] = cb.m_data.x;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt2Kernel(__global int2* dstInt2, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" dstInt2[ cb.m_offset+gIdx ] = make_int2( cb.m_data.x, cb.m_data.y );\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt4Kernel(__global int4* dstInt4, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" dstInt4[ cb.m_offset+gIdx ] = cb.m_data;\n"
" }\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,69 @@
//Embedded HLSL source for the fill kernels (mirrors the DX11 compute shaders);
//used when ADL_LOAD_KERNEL_FROM_STRING is defined. Keep in sync with the .hlsl file.
static const char* fillKernelsDX11= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define GROUP_MEM_FENCE\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"#define make_uint4 uint4\n"
"#define make_uint2 uint2\n"
"#define make_int2 int2\n"
"\n"
"\n"
"cbuffer CB : register( b0 )\n"
"{\n"
" int4 m_data;\n"
" int m_offset;\n"
" int m_n;\n"
" int m_padding[2];\n"
"};\n"
"\n"
"\n"
"RWStructuredBuffer<int> dstInt : register( u0 );\n"
"\n"
"[numthreads(64, 1, 1)]\n"
"void FillIntKernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < m_n )\n"
" {\n"
" dstInt[ m_offset+gIdx ] = m_data.x;\n"
" }\n"
"}\n"
"\n"
"RWStructuredBuffer<int2> dstInt2 : register( u0 );\n"
"\n"
"[numthreads(64, 1, 1)]\n"
"void FillInt2Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < m_n )\n"
" {\n"
" dstInt2[ m_offset+gIdx ] = make_int2( m_data.x, m_data.y );\n"
" }\n"
"}\n"
"\n"
"RWStructuredBuffer<int4> dstInt4 : register( u0 );\n"
"\n"
"[numthreads(64, 1, 1)]\n"
"void FillInt4Kernel( DEFAULT_ARGS )\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < m_n )\n"
" {\n"
" dstInt4[ m_offset+gIdx ] = m_data;\n"
" }\n"
"}\n"
;

View File

@@ -0,0 +1,231 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef ARRAY_H
#define ARRAY_H
#include <string.h>
#include <malloc.h>
#include <Common/Base/Error.h>
#include <new.h>
namespace adl
{
//Simple 16-byte-aligned dynamic array.
//Storage is raw _aligned_malloc memory moved with memcpy, and element
//destructors are never invoked — suitable for POD/trivially-copyable T only.
//Non-copyable (copy constructor is private).
template <class T>
class Array
{
public:
__inline
Array();
//Creates an array with 'size' default-constructed elements.
__inline
Array(int size);
__inline
~Array();
__inline
T& operator[] (int idx);
__inline
const T& operator[] (int idx) const;
__inline
void pushBack(const T& elem);
__inline
void popBack();
//Resets the size to zero; capacity (and constructed elements) are retained.
__inline
void clear();
__inline
void setSize(int size);
__inline
int getSize() const;
__inline
T* begin();
__inline
const T* begin() const;
__inline
T* end();
__inline
const T* end() const;
//Linear search; returns the first matching index or -1.
__inline
int indexOf(const T& data) const;
//O(1) unordered removal (last element is moved into the vacated slot).
__inline
void removeAt(int idx);
__inline
T& expandOne();
private:
Array(const Array& a){} //copying disabled
private:
enum
{
DEFAULT_SIZE = 128, //initial capacity of a default-constructed array
INCREASE_SIZE = 128, //additive growth step used by pushBack
};
T* m_data;
int m_size;
int m_capacity;
};
//Default constructor: empty array with DEFAULT_SIZE capacity of
//default-constructed, 16-byte-aligned storage.
template<class T>
Array<T>::Array()
{
	m_size = 0;
	m_capacity = DEFAULT_SIZE;
	m_data = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
	for(int idx=0; idx<m_capacity; idx++)
		new(&m_data[idx])T;
}
//Sized constructor: 'size' default-constructed elements (capacity == size).
template<class T>
Array<T>::Array(int size)
{
	m_size = size;
	m_capacity = size;
	m_data = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
	for(int idx=0; idx<m_capacity; idx++)
		new(&m_data[idx])T;
}
//Releases the aligned storage. Element destructors are not invoked here —
//this container is intended for POD-like T.
template<class T>
Array<T>::~Array()
{
	if( m_data != NULL )
	{
		_aligned_free( m_data );
		m_data = NULL;
	}
}
//Bounds-checked (debug assert only) element access.
//NOTE(review): negative idx is not asserted against — only idx<m_size.
template<class T>
T& Array<T>::operator[](int idx)
{
ADLASSERT(idx<m_size);
return m_data[idx];
}
template<class T>
const T& Array<T>::operator[](int idx) const
{
ADLASSERT(idx<m_size);
return m_data[idx];
}
//Appends a copy of 'elem', growing the storage by INCREASE_SIZE when full.
//Fix: the newly allocated slots beyond oldCap are now placement-new
//default-constructed (as setSize() already does) before being assigned to —
//previously the assignment below targeted raw, unconstructed storage.
//Existing elements are still relocated with memcpy, so T must remain
//trivially copyable.
template<class T>
void Array<T>::pushBack(const T& elem)
{
	if( m_size == m_capacity )
	{
		int oldCap = m_capacity;
		m_capacity += INCREASE_SIZE;
		T* s = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
		for(int i=oldCap; i<m_capacity; i++) new(&s[i])T;
		memcpy( s, m_data, sizeof(T)*oldCap );
		_aligned_free( m_data );
		m_data = s;
	}
	m_data[ m_size++ ] = elem;
}
//Removes the last element by shrinking the logical size; the slot stays
//constructed and its destructor is not run.
template<class T>
void Array<T>::popBack()
{
	ADLASSERT( m_size>0 );
	--m_size;
}
//Logically empties the array; capacity is kept and no destructors run.
template<class T>
void Array<T>::clear()
{
m_size = 0;
}
//Sets the logical size, reallocating (to exactly 'size') when it exceeds the
//current capacity. Shrinking only adjusts m_size.
//NOTE(review): on growth, all new slots are default-constructed and then the
//first oldCap are overwritten with memcpy — harmless for trivially-copyable T,
//which is what this container assumes throughout.
template<class T>
void Array<T>::setSize(int size)
{
if( size > m_capacity )
{
int oldCap = m_capacity;
m_capacity = size;
T* s = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
for(int i=0; i<m_capacity; i++) new(&s[i])T;
memcpy( s, m_data, sizeof(T)*oldCap );
_aligned_free( m_data );
m_data = s;
}
m_size = size;
}
//Number of elements currently in use (not the capacity).
template<class T>
int Array<T>::getSize() const
{
return m_size;
}
//begin()/end() expose the contiguous storage as an STL-style half-open range.
template<class T>
const T* Array<T>::begin() const
{
return m_data;
}
template<class T>
T* Array<T>::begin()
{
return m_data;
}
template<class T>
T* Array<T>::end()
{
return m_data+m_size;
}
template<class T>
const T* Array<T>::end() const
{
return m_data+m_size;
}
//Linear search for the first element equal to 'data'; returns -1 when absent.
template<class T>
int Array<T>::indexOf(const T& data) const
{
	for(int idx=0; idx<m_size; idx++)
	{
		if( data == m_data[idx] )
			return idx;
	}
	return -1;
}
//O(1) unordered removal: the last element is moved into the vacated slot,
//so element order is NOT preserved.
template<class T>
void Array<T>::removeAt(int idx)
{
ADLASSERT(idx<m_size);
m_data[idx] = m_data[--m_size];
}
//Grows the array by one element (default-constructed via setSize) and returns
//a reference to it.
template<class T>
T& Array<T>::expandOne()
{
setSize( m_size+1 );
return m_data[ m_size-1 ];
}
};
#endif

View File

@@ -0,0 +1,173 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//Host-side helpers emulating the OpenCL float2 vector type.
//Builds a float2 from two components.
__inline
float2 make_float2(float x, float y)
{
float2 v;
v.s[0] = x; v.s[1] = y;
return v;
}
//Broadcasts a scalar into both components.
__inline
float2 make_float2(float x)
{
return make_float2(x,x);
}
//Component-wise int2 -> float2 conversion.
__inline
float2 make_float2(const int2& x)
{
return make_float2((float)x.s[0], (float)x.s[1]);
}
//Component-wise negation.
//NOTE(review): uses .x/.y while the rest of this file uses .s[] — presumably
//float2 is a union exposing both views; confirm against the float2 definition.
__inline
float2 operator-(const float2& a)
{
return make_float2(-a.x, -a.y);
}
//Component-wise multiplication of two float2 values.
__inline
float2 operator*(const float2& a, const float2& b)
{
float2 out;
out.s[0] = a.s[0]*b.s[0];
out.s[1] = a.s[1]*b.s[1];
return out;
}
//Scalar * vector (both operand orders).
__inline
float2 operator*(float a, const float2& b)
{
return make_float2(a*b.s[0], a*b.s[1]);
}
__inline
float2 operator*(const float2& b, float a)
{
return make_float2(a*b.s[0], a*b.s[1]);
}
//In-place component-wise and scalar multiplication.
__inline
void operator*=(float2& a, const float2& b)
{
a.s[0]*=b.s[0];
a.s[1]*=b.s[1];
}
__inline
void operator*=(float2& a, float b)
{
a.s[0]*=b;
a.s[1]*=b;
}
//Component-wise division of two float2 values (no divide-by-zero checks).
__inline
float2 operator/(const float2& a, const float2& b)
{
float2 out;
out.s[0] = a.s[0]/b.s[0];
out.s[1] = a.s[1]/b.s[1];
return out;
}
//Vector / scalar.
__inline
float2 operator/(const float2& b, float a)
{
return make_float2(b.s[0]/a, b.s[1]/a);
}
//In-place component-wise and scalar division.
__inline
void operator/=(float2& a, const float2& b)
{
a.s[0]/=b.s[0];
a.s[1]/=b.s[1];
}
__inline
void operator/=(float2& a, float b)
{
a.s[0]/=b;
a.s[1]/=b;
}
//Component-wise addition and subtraction for float2 (scalar overloads apply
//the scalar to both components).
__inline
float2 operator+(const float2& a, const float2& b)
{
float2 out;
out.s[0] = a.s[0]+b.s[0];
out.s[1] = a.s[1]+b.s[1];
return out;
}
__inline
float2 operator+(const float2& a, float b)
{
float2 out;
out.s[0] = a.s[0]+b;
out.s[1] = a.s[1]+b;
return out;
}
__inline
float2 operator-(const float2& a, const float2& b)
{
float2 out;
out.s[0] = a.s[0]-b.s[0];
out.s[1] = a.s[1]-b.s[1];
return out;
}
__inline
float2 operator-(const float2& a, float b)
{
float2 out;
out.s[0] = a.s[0]-b;
out.s[1] = a.s[1]-b;
return out;
}
__inline
void operator+=(float2& a, const float2& b)
{
a.s[0]+=b.s[0];
a.s[1]+=b.s[1];
}
__inline
void operator+=(float2& a, float b)
{
a.s[0]+=b;
a.s[1]+=b;
}
__inline
void operator-=(float2& a, const float2& b)
{
a.s[0]-=b.s[0];
a.s[1]-=b.s[1];
}
__inline
void operator-=(float2& a, float b)
{
a.s[0]-=b;
a.s[1]-=b;
}

View File

@@ -0,0 +1,375 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//#define CHECK_ALIGNMENT(a) ADLASSERT((u32(&(a)) & 0xf) == 0);
#define CHECK_ALIGNMENT(a) a;

// Constructs a float4; w defaults to 0.
__inline
float4 make_float4(float x, float y, float z, float w = 0.f)
{
	float4 v;
	v.s[0] = x;
	v.s[1] = y;
	v.s[2] = z;
	v.s[3] = w;
	return v;
}

// Constructs a float4 with all four components set to x.
__inline
float4 make_float4(float x)
{
	return make_float4(x, x, x, x);
}

// Converts an int4 to a float4, component by component.
__inline
float4 make_float4(const int4& x)
{
	return make_float4((float)x.x, (float)x.y, (float)x.z, (float)x.w);
}

// Constructs an int4; w defaults to 0.
__inline
int4 make_int4(int x, int y, int z, int w = 0)
{
	int4 v;
	v.x = x;
	v.y = y;
	v.z = z;
	v.w = w;
	return v;
}

// Constructs an int4 with all four components set to x.
__inline
int4 make_int4(int x)
{
	return make_int4(x, x, x, x);
}

// Truncates each float4 component to int.
__inline
int4 make_int4(const float4& x)
{
	return make_int4((int)x.s[0], (int)x.s[1], (int)x.s[2], (int)x.s[3]);
}

// Constructs an int2 from two ints.
__inline
int2 make_int2(int a, int b)
{
	int2 ans;
	ans.s[0] = a;
	ans.s[1] = b;
	return ans;
}

// Exact component-wise equality (no epsilon).
__inline
bool operator ==(const int2& a, const int2& b)
{
	return a.s[0]==b.s[0] && a.s[1]==b.s[1];
}

// Exact component-wise equality (no epsilon).
__inline
bool operator ==(const int4& a, const int4& b)
{
	return a.s[0]==b.s[0] && a.s[1]==b.s[1] && a.s[2]==b.s[2] && a.s[3]==b.s[3];
}

// Exact component-wise equality (NaN components compare unequal).
__inline
bool operator ==(const float2& a, const float2& b)
{
	return a.s[0]==b.s[0] && a.s[1]==b.s[1];
}

// Exact component-wise equality (NaN components compare unequal).
__inline
bool operator ==(const float4& a, const float4& b)
{
	return a.s[0]==b.s[0] && a.s[1]==b.s[1] && a.s[2]==b.s[2] && a.s[3]==b.s[3];
}
// Component-wise negation.
__inline
float4 operator-(const float4& a)
{
	return make_float4(-a.s[0], -a.s[1], -a.s[2], -a.s[3]);
}

// Component-wise multiplication.
__inline
float4 operator*(const float4& a, const float4& b)
{
	float4 out;
	out.x = a.x*b.x;
	out.y = a.y*b.y;
	out.z = a.z*b.z;
	out.w = a.w*b.w;
	return out;
}

// Scalar times vector.
__inline
float4 operator*(float a, const float4& b)
{
	return make_float4(a*b.x, a*b.y, a*b.z, a*b.w);
}

// Vector times scalar; checks the 16-byte alignment contract when the
// CHECK_ALIGNMENT hook is enabled.
__inline
float4 operator*(const float4& b, float a)
{
	CHECK_ALIGNMENT(b);
	return make_float4(a*b.x, a*b.y, a*b.z, a*b.w);
}

// In-place component-wise multiplication.
__inline
void operator*=(float4& a, const float4& b)
{
	CHECK_ALIGNMENT(a);
	a.x *= b.x;
	a.y *= b.y;
	a.z *= b.z;
	a.w *= b.w;
}

// In-place scaling by a scalar.
__inline
void operator*=(float4& a, float b)
{
	CHECK_ALIGNMENT(a);
	a.x *= b;
	a.y *= b;
	a.z *= b;
	a.w *= b;
}
//
// Component-wise division.
__inline
float4 operator/(const float4& a, const float4& b)
{
	CHECK_ALIGNMENT(a);
	float4 out;
	out.s[0] = a.s[0]/b.s[0];
	out.s[1] = a.s[1]/b.s[1];
	out.s[2] = a.s[2]/b.s[2];
	out.s[3] = a.s[3]/b.s[3];
	return out;
}

// Vector divided by a scalar.
__inline
float4 operator/(const float4& b, float a)
{
	CHECK_ALIGNMENT(b);
	return make_float4(b.s[0]/a, b.s[1]/a, b.s[2]/a, b.s[3]/a);
}

// In-place component-wise division.
// CHECK_ALIGNMENT added for consistency: every other compound-assignment
// operator in this file verifies its destination's alignment through the
// same hook.
__inline
void operator/=(float4& a, const float4& b)
{
	CHECK_ALIGNMENT(a);
	a.s[0]/=b.s[0];
	a.s[1]/=b.s[1];
	a.s[2]/=b.s[2];
	a.s[3]/=b.s[3];
}

// In-place division by a scalar.
// The raw ADLASSERT alignment check was replaced by the CHECK_ALIGNMENT
// macro used by every sibling operator, so the check is switched on/off
// in one place for the whole file.
__inline
void operator/=(float4& a, float b)
{
	CHECK_ALIGNMENT(a);
	a.s[0]/=b;
	a.s[1]/=b;
	a.s[2]/=b;
	a.s[3]/=b;
}
//
// Component-wise addition.
__inline
float4 operator+(const float4& a, const float4& b)
{
	CHECK_ALIGNMENT(a);
	float4 out;
	out.x = a.x+b.x;
	out.y = a.y+b.y;
	out.z = a.z+b.z;
	out.w = a.w+b.w;
	return out;
}

// Adds a scalar to every component.
__inline
float4 operator+(const float4& a, float b)
{
	CHECK_ALIGNMENT(a);
	float4 out;
	out.x = a.x+b;
	out.y = a.y+b;
	out.z = a.z+b;
	out.w = a.w+b;
	return out;
}

// Component-wise subtraction.
__inline
float4 operator-(const float4& a, const float4& b)
{
	CHECK_ALIGNMENT(a);
	float4 out;
	out.x = a.x-b.x;
	out.y = a.y-b.y;
	out.z = a.z-b.z;
	out.w = a.w-b.w;
	return out;
}

// Subtracts a scalar from every component.
__inline
float4 operator-(const float4& a, float b)
{
	CHECK_ALIGNMENT(a);
	float4 out;
	out.x = a.x-b;
	out.y = a.y-b;
	out.z = a.z-b;
	out.w = a.w-b;
	return out;
}

// In-place component-wise addition.
__inline
void operator+=(float4& a, const float4& b)
{
	CHECK_ALIGNMENT(a);
	a.x += b.x;
	a.y += b.y;
	a.z += b.z;
	a.w += b.w;
}

// In-place scalar addition.
__inline
void operator+=(float4& a, float b)
{
	CHECK_ALIGNMENT(a);
	a.x += b;
	a.y += b;
	a.z += b;
	a.w += b;
}

// In-place component-wise subtraction.
__inline
void operator-=(float4& a, const float4& b)
{
	CHECK_ALIGNMENT(a);
	a.x -= b.x;
	a.y -= b.y;
	a.z -= b.z;
	a.w -= b.w;
}

// In-place scalar subtraction.
__inline
void operator-=(float4& a, float b)
{
	CHECK_ALIGNMENT(a);
	a.x -= b;
	a.y -= b;
	a.z -= b;
	a.w -= b;
}
__inline
float4 cross3(const float4& a, const float4& b)
{
return make_float4(a.s[1]*b.s[2]-a.s[2]*b.s[1],
a.s[2]*b.s[0]-a.s[0]*b.s[2],
a.s[0]*b.s[1]-a.s[1]*b.s[0],
0);
}
__inline
float dot3F4(const float4& a, const float4& b)
{
return a.x*b.x+a.y*b.y+a.z*b.z;
}
__inline
float length3(const float4& a)
{
return sqrtf(dot3F4(a,a));
}
__inline
float dot4(const float4& a, const float4& b)
{
return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
}
// for height
__inline
float dot3w1(const float4& point, const float4& eqn)
{
return point.x*eqn.x+point.y*eqn.y+point.z*eqn.z+eqn.w;
}
__inline
float4 normalize3(const float4& a)
{
float length = sqrtf(dot3F4(a, a));
return 1.f/length * a;
}
__inline
float4 normalize4(const float4& a)
{
float length = sqrtf(dot4(a, a));
return 1.f/length * a;
}
__inline
float4 createEquation(const float4& a, const float4& b, const float4& c)
{
float4 eqn;
float4 ab = b-a;
float4 ac = c-a;
eqn = normalize3( cross3(ab, ac) );
eqn.w = -dot3F4(eqn,a);
return eqn;
}
__inline
float intersectPlaneLine( const float4& planeEqn, const float4& vec, const float4& orig )
{
return (-planeEqn.w - dot3F4(planeEqn, orig))/dot3F4(planeEqn, vec);
}
template<>
__inline
float4 max2(const float4& a, const float4& b)
{
return make_float4( max2(a.x,b.x), max2(a.y,b.y), max2(a.z,b.z), max2(a.w,b.w) );
}
template<>
__inline
float4 min2(const float4& a, const float4& b)
{
return make_float4( min2(a.x,b.x), min2(a.y,b.y), min2(a.z,b.z), min2(a.w,b.w) );
}

View File

@@ -0,0 +1,224 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef CL_MATH_H
#define CL_MATH_H
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <xmmintrin.h>
#include <Adl/Adl.h>
#include <algorithm>
// Alias so the sort implementation can be swapped in a single place.
#define pxSort std::sort
// Single-precision pi.
#define PI 3.14159265358979323846f
// Rounds num up to the next multiple of alignment (num itself if already a multiple).
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
// 16-byte alignment attribute for the SIMD vector types below.
// NOTE(review): __declspec(align) and _aligned_malloc/_aligned_free are
// MSVC-specific -- confirm before building with other toolchains.
#define _MEM_CLASSALIGN16 __declspec(align(16))
// Class-scope allocation operators guaranteeing 16-byte aligned heap storage
// (needed because float4 carries an __m128 member); includes placement new.
#define _MEM_ALIGNED_ALLOCATOR16 void* operator new(size_t size) { return _aligned_malloc( size, 16 ); } \
void operator delete(void *p) { _aligned_free( p ); } \
void* operator new[](size_t size) { return _aligned_malloc( size, 16 ); } \
void operator delete[](void *p) { _aligned_free( p ); } \
void* operator new(size_t size, void* p) { return p; } \
void operator delete(void *p, void* pp) {}
namespace adl
{
// Rounds n up to the nearest power of two (n is returned unchanged when it
// is already a power of two; n == 0 yields 0 via unsigned wrap-around).
// Intended for unsigned integral T.
template<class T>
T nextPowerOf2(T n)
{
	n -= 1;
	// Smear the highest set bit of (n-1) into all lower positions, doubling
	// the shift each step: O(log bits) instead of the previous O(bits) loop
	// (which also wasted its first iteration on a shift by zero).
	for(int shift=1; shift<int(sizeof(T)*8); shift*=2)
		n = n | (n>>shift);
	return n+1;
}
// Shorthand fixed-width-style aliases used throughout the Adl primitives.
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
// 16-byte aligned 4-float vector. The anonymous union lets code address the
// same 16 bytes by component name (x,y,z,w), by index (s[0..3]) or as an SSE
// register (m_quad); both naming styles are used interchangeably in this file.
_MEM_CLASSALIGN16
struct float4
{
_MEM_ALIGNED_ALLOCATOR16;
union
{
struct
{
float x,y,z,w;
};
struct
{
float s[4];
};
__m128 m_quad;
};
};
// 16-byte aligned 4-int vector; same dual naming as float4 (no SSE view).
_MEM_CLASSALIGN16
struct int4
{
_MEM_ALIGNED_ALLOCATOR16;
union
{
struct
{
int x,y,z,w;
};
struct
{
int s[4];
};
};
};
// 16-byte aligned 4-u32 vector.
_MEM_CLASSALIGN16
struct uint4
{
_MEM_ALIGNED_ALLOCATOR16;
union
{
struct
{
u32 x,y,z,w;
};
struct
{
u32 s[2];
};
};
};
// 2-int vector; natural alignment (no 16-byte requirement).
struct int2
{
union
{
struct
{
int x,y;
};
struct
{
int s[2];
};
};
};
// 2-float vector; natural alignment (no 16-byte requirement).
struct float2
{
union
{
struct
{
float x,y;
};
struct
{
float s[2];
};
};
};
// Generic two-argument maximum; returns b when the operands compare equal
// (only operator> is required of T).
template<typename T>
__inline
T max2(const T& a, const T& b)
{
	if( a>b )
		return a;
	return b;
}

// Generic two-argument minimum; returns b when the operands compare equal
// (only operator< is required of T).
template<typename T>
__inline
T min2(const T& a, const T& b)
{
	if( a<b )
		return a;
	return b;
}
#include <AdlPrimitives/Math/Float4.inl>
#include <AdlPrimitives/Math/Float2.inl>
// Exchanges the values of a and b (T must be copyable).
template<typename T>
void swap2(T& a, T& b)
{
	T oldA = a;
	a = b;
	b = oldA;
}
// Seeds the C library pseudo-random generator used by getRandom().
__inline
void seedRandom(int seed)
{
	srand( seed );
}

// Returns a pseudo-random value uniformly distributed in [minV, maxV) with a
// resolution of 1/10000 of the range. Call seedRandom() first for
// reproducible sequences.
template<typename T>
__inline
T getRandom(const T& minV, const T& maxV)
{
	const float t = (rand()%10000)/10000.f;
	return (T)(minV + t*(maxV - minV));
}
// getRandom() specialization for float4: draws an independent sample in
// [0,1) for each of the four components, then maps into [minV, maxV).
// NOTE(review): the four rand() calls are arguments of make_float4, so the
// order in which components consume the rand() sequence is unspecified by
// C++ -- confirm no caller relies on a particular per-lane ordering.
template<>
__inline
float4 getRandom(const float4& minV, const float4& maxV)
{
float4 r = make_float4( (rand()%10000)/10000.f, (rand()%10000)/10000.f, (rand()%10000)/10000.f, (rand()%10000)/10000.f );
float4 range = maxV - minV;
return (minV + r*range);
}
// Returns a pointer of type T* located 'offset' BYTES past baseAddr.
// Uses char* arithmetic so the computation is correct on both 32- and
// 64-bit platforms (the previous cast through u32 truncated 64-bit
// pointers).
template<typename T>
T* addByteOffset(void* baseAddr, u32 offset)
{
	return (T*)( (char*)baseAddr + offset );
}
// Plain pair of unsigned ints. The default constructor leaves both members
// uninitialized (intentional for bulk-allocated arrays).
struct Pair32
{
Pair32(){}
Pair32(u32 a, u32 b) : m_a(a), m_b(b){}
u32 m_a;
u32 m_b;
};
// Pair of raw, non-owning pointers; the templated constructor accepts any
// typed pointer pair and erases the type.
struct PtrPair
{
PtrPair(){}
PtrPair(void* a, void* b) : m_a(a), m_b(b){}
template<typename T>
PtrPair(T* a, T* b) : m_a((void*)a), m_b((void*)b){}
void* m_a;
void* m_b;
};
};
#endif

View File

@@ -0,0 +1,357 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// Vendor/Khronos extensions used by these kernels: printf for debugging and
// 32-bit base/extended atomics in local and global memory.
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
// Work-item / work-group index helpers (all launches here are 1D).
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GET_NUM_GROUPS get_num_groups(0)
// Work-group-wide barrier / fence over local memory.
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
// Thin wrappers over the OpenCL atomic builtins; the *1/AppendInc forms
// also return the pre-increment value in 'out'.
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define AppendInc(x, out) out = atomic_inc(x)
#define AtomAdd(x, value) atom_add(&(x), value)
#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
#define AtomXhg(x, value) atom_xchg ( &(x), value )
// Per-component select: yields a where condition's MSB is set, b elsewhere.
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
// Constructor-style helpers matching the host-side make_* naming.
#define make_float4 (float4)
#define make_float2 (float2)
#define make_uint4 (uint4)
#define make_int4 (int4)
#define make_uint2 (uint2)
#define make_int2 (int2)
// Host code uses max2/min2; map them to the OpenCL builtins.
#define max2 max
#define min2 min
///////////////////////////////////////
// Vector
///////////////////////////////////////
// Fast division via the native_* builtin (implementation-defined, reduced
// precision in exchange for speed).
__inline
float fastDiv(float numerator, float denominator)
{
return native_divide(numerator, denominator);
// return numerator/denominator;
}
// Component-wise fast division for float4.
__inline
float4 fastDiv4(float4 numerator, float4 denominator)
{
return native_divide(numerator, denominator);
}
// Fast square root (reduced precision).
__inline
float fastSqrtf(float f2)
{
return native_sqrt(f2);
// return sqrt(f2);
}
// Fast reciprocal square root (reduced precision).
__inline
float fastRSqrt(float f2)
{
return native_rsqrt(f2);
}
// Fast 4-component length / normalization (reduced precision builtins).
__inline
float fastLength4(float4 v)
{
return fast_length(v);
}
__inline
float4 fastNormalize4(float4 v)
{
return fast_normalize(v);
}
// sqrtf shim so host-style code compiles in OpenCL; uses the fast builtin.
__inline
float sqrtf(float a)
{
// return sqrt(a);
return native_sqrt(a);
}
// Cross product (w of the result follows the OpenCL cross() builtin).
__inline
float4 cross3(float4 a, float4 b)
{
return cross(a,b);
}
// Dot product of the xyz parts only; w is masked to 0 before dot().
__inline
float dot3F4(float4 a, float4 b)
{
float4 a1 = make_float4(a.xyz,0.f);
float4 b1 = make_float4(b.xyz,0.f);
return dot(a1, b1);
}
// Euclidean length of the xyz part.
__inline
float length3(const float4 a)
{
return sqrtf(dot3F4(a,a));
}
// Full 4-component dot product.
__inline
float dot4(const float4 a, const float4 b)
{
return dot( a, b );
}
// for height
// Point-plane value: dot(point.xyz, eqn.xyz) + eqn.w.
__inline
float dot3w1(const float4 point, const float4 eqn)
{
return dot3F4(point,eqn) + eqn.w;
}
// Normalizes the xyz part (w of the result is 0 because it is masked first).
__inline
float4 normalize3(const float4 a)
{
float4 n = make_float4(a.x, a.y, a.z, 0.f);
return fastNormalize4( n );
// float length = sqrtf(dot3F4(a, a));
// return 1.f/length * a;
}
// Normalizes all four components.
__inline
float4 normalize4(const float4 a)
{
float length = sqrtf(dot4(a, a));
return 1.f/length * a;
}
// Plane equation of triangle (a,b,c): unit normal in xyz, -dot(n,a) in w.
__inline
float4 createEquation(const float4 a, const float4 b, const float4 c)
{
float4 eqn;
float4 ab = b-a;
float4 ac = c-a;
eqn = normalize3( cross3(ab, ac) );
eqn.w = -dot3F4(eqn,a);
return eqn;
}
///////////////////////////////////////
// Matrix3x3
///////////////////////////////////////
// Row-major 3x3 matrix stored as three float4 rows (w of each row is padding
// and kept at 0 by the functions below).
typedef struct
{
float4 m_row[3];
}Matrix3x3;
__inline
Matrix3x3 mtZero();
__inline
Matrix3x3 mtIdentity();
__inline
Matrix3x3 mtTranspose(Matrix3x3 m);
__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);
__inline
float4 mtMul1(Matrix3x3 a, float4 b);
__inline
float4 mtMul3(float4 a, Matrix3x3 b);
// All-zero matrix.
__inline
Matrix3x3 mtZero()
{
Matrix3x3 m;
m.m_row[0] = (float4)(0.f);
m.m_row[1] = (float4)(0.f);
m.m_row[2] = (float4)(0.f);
return m;
}
// Identity matrix.
__inline
Matrix3x3 mtIdentity()
{
Matrix3x3 m;
m.m_row[0] = (float4)(1,0,0,0);
m.m_row[1] = (float4)(0,1,0,0);
m.m_row[2] = (float4)(0,0,1,0);
return m;
}
// Transpose; the padding w of each output row is set to 0.
__inline
Matrix3x3 mtTranspose(Matrix3x3 m)
{
Matrix3x3 out;
out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
return out;
}
// Matrix product a*b computed as row-row dot products against transpose(b).
// NOTE(review): the w components of a's rows are zeroed BEFORE the loop as a
// workaround (see original comment below) -- do not fold the zeroing back
// into the loop without re-testing on the affected compiler.
__inline
Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)
{
Matrix3x3 transB;
transB = mtTranspose( b );
Matrix3x3 ans;
// why this doesn't run when 0ing in the for{}
a.m_row[0].w = 0.f;
a.m_row[1].w = 0.f;
a.m_row[2].w = 0.f;
for(int i=0; i<3; i++)
{
// a.m_row[i].w = 0.f;
ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
ans.m_row[i].w = 0.f;
}
return ans;
}
// Matrix times column vector (only xyz of b is used); result w is 0.
__inline
float4 mtMul1(Matrix3x3 a, float4 b)
{
float4 ans;
ans.x = dot3F4( a.m_row[0], b );
ans.y = dot3F4( a.m_row[1], b );
ans.z = dot3F4( a.m_row[2], b );
ans.w = 0.f;
return ans;
}
// Row vector times matrix (only xyz of a is used).
// NOTE(review): unlike mtMul1, ans.w is left unset here -- confirm callers
// never read the w of the result.
__inline
float4 mtMul3(float4 a, Matrix3x3 b)
{
float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
float4 ans;
ans.x = dot3F4( a, colx );
ans.y = dot3F4( a, coly );
ans.z = dot3F4( a, colz );
return ans;
}
///////////////////////////////////////
// Quaternion
///////////////////////////////////////
// Quaternion stored as float4: (x,y,z) = vector part, w = scalar part.
typedef float4 Quaternion;
__inline
Quaternion qtMul(Quaternion a, Quaternion b);
__inline
Quaternion qtNormalize(Quaternion in);
__inline
float4 qtRotate(Quaternion q, float4 vec);
__inline
Quaternion qtInvert(Quaternion q);
__inline
Matrix3x3 qtGetRotationMatrix(Quaternion q);
// Hamilton product a*b. The vector part is cross(a,b) + a.w*b + b.w*a
// (the temporary w written by the += is overwritten on the next line);
// dot3F4 ignores w, so the scalar part is a.w*b.w - dot(a.xyz, b.xyz).
__inline
Quaternion qtMul(Quaternion a, Quaternion b)
{
Quaternion ans;
ans = cross3( a, b );
ans += a.w*b+b.w*a;
// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
ans.w = a.w*b.w - dot3F4(a, b);
return ans;
}
// Normalizes all four components (reduced-precision fast builtin).
__inline
Quaternion qtNormalize(Quaternion in)
{
return fastNormalize4(in);
// in /= length( in );
// return in;
}
// Rotates vec by q: q * (vec with w=0) * conjugate(q).
// Assumes q is unit length (qtInvert returns the conjugate only).
__inline
float4 qtRotate(Quaternion q, float4 vec)
{
Quaternion qInv = qtInvert( q );
float4 vcpy = vec;
vcpy.w = 0.f;
float4 out = qtMul(qtMul(q,vcpy),qInv);
return out;
}
// Conjugate (the inverse for unit quaternions).
__inline
Quaternion qtInvert(Quaternion q)
{
return (Quaternion)(-q.xyz, q.w);
}
// Rotates vec by the inverse of q.
__inline
float4 qtInvRotate(const Quaternion q, float4 vec)
{
return qtRotate( qtInvert( q ), vec );
}
// Standard unit-quaternion to rotation-matrix expansion; each row's
// padding w is set to 0.
__inline
Matrix3x3 qtGetRotationMatrix(Quaternion quat)
{
float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);
Matrix3x3 out;
out.m_row[0].x=1-2*quat2.y-2*quat2.z;
out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;
out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;
out.m_row[0].w = 0.f;
out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;
out.m_row[1].y=1-2*quat2.x-2*quat2.z;
out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;
out.m_row[1].w = 0.f;
out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;
out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;
out.m_row[2].z=1-2*quat2.x-2*quat2.y;
out.m_row[2].w = 0.f;
return out;
}

View File

@@ -0,0 +1,197 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef MATRIX3X3_H
#define MATRIX3X3_H
#include <AdlPrimitives/Math/Math.h>
///////////////////////////////////////
// Matrix3x3
///////////////////////////////////////
namespace adl
{
// Row-major 3x3 matrix stored as three 16-byte aligned float4 rows; the w
// component of each row is padding.
typedef
_MEM_CLASSALIGN16 struct
{
_MEM_ALIGNED_ALLOCATOR16;
float4 m_row[3];
}Matrix3x3;
// Forward declarations (definitions below can therefore call each other).
__inline
Matrix3x3 mtZero();
__inline
Matrix3x3 mtIdentity();
__inline
Matrix3x3 mtDiagonal(float a, float b, float c);
__inline
Matrix3x3 mtTranspose(const Matrix3x3& m);
__inline
Matrix3x3 mtMul(const Matrix3x3& a, const Matrix3x3& b);
// Matrix times column vector.
__inline
float4 mtMul1(const Matrix3x3& a, const float4& b);
// Scalar times matrix.
__inline
Matrix3x3 mtMul2(float a, const Matrix3x3& b);
// Row vector times matrix.
__inline
float4 mtMul3(const float4& b, const Matrix3x3& a);
__inline
Matrix3x3 mtInvert(const Matrix3x3& m);
// Diagonal matrix with (a, b, c) on the main diagonal; row paddings are 0
// (make_float4 defaults w to 0).
__inline
Matrix3x3 mtDiagonal(float a, float b, float c)
{
	Matrix3x3 m;
	m.m_row[0] = make_float4(a, 0.f, 0.f);
	m.m_row[1] = make_float4(0.f, b, 0.f);
	m.m_row[2] = make_float4(0.f, 0.f, c);
	return m;
}

// All-zero matrix.
__inline
Matrix3x3 mtZero()
{
	return mtDiagonal(0.f, 0.f, 0.f);
}

// Identity matrix.
__inline
Matrix3x3 mtIdentity()
{
	return mtDiagonal(1.f, 1.f, 1.f);
}
// Transpose; the padding w of every output row is set to 0.
__inline
Matrix3x3 mtTranspose(const Matrix3x3& m)
{
	Matrix3x3 out;
	out.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
	out.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
	out.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
	return out;
}
// Matrix product a*b, computed as row-row dot products against transpose(b).
__inline
Matrix3x3 mtMul(const Matrix3x3& a, const Matrix3x3& b)
{
	Matrix3x3 transB;
	transB = mtTranspose( b );
	Matrix3x3 ans;
	for(int i=0; i<3; i++)
	{
		ans.m_row[i].s[0] = dot3F4(a.m_row[i],transB.m_row[0]);
		ans.m_row[i].s[1] = dot3F4(a.m_row[i],transB.m_row[1]);
		ans.m_row[i].s[2] = dot3F4(a.m_row[i],transB.m_row[2]);
		ans.m_row[i].s[3] = 0.f;	// was left uninitialized; zero like mtTranspose does
	}
	return ans;
}
// Matrix times column vector (only xyz of b is used); result w is 0.
__inline
float4 mtMul1(const Matrix3x3& a, const float4& b)
{
	float4 ans;
	ans.s[0] = dot3F4( a.m_row[0], b );
	ans.s[1] = dot3F4( a.m_row[1], b );
	ans.s[2] = dot3F4( a.m_row[2], b );
	ans.s[3] = 0.f;	// was left uninitialized; matches the OpenCL mtMul1
	return ans;
}
// Scalar times matrix.
__inline
Matrix3x3 mtMul2(float a, const Matrix3x3& b)
{
	Matrix3x3 ans;
	ans.m_row[0] = a*b.m_row[0];
	ans.m_row[1] = a*b.m_row[1];
	ans.m_row[2] = a*b.m_row[2];
	return ans;
}
// Row vector times matrix (only xyz of a is used); result w is 0.
__inline
float4 mtMul3(const float4& a, const Matrix3x3& b)
{
	float4 ans;
	ans.x = a.x*b.m_row[0].x + a.y*b.m_row[1].x + a.z*b.m_row[2].x;
	ans.y = a.x*b.m_row[0].y + a.y*b.m_row[1].y + a.z*b.m_row[2].y;
	ans.z = a.x*b.m_row[0].z + a.y*b.m_row[1].z + a.z*b.m_row[2].z;
	ans.w = 0.f;	// was left uninitialized
	return ans;
}
// Inverse via the adjugate: determinant by Sarrus' rule, each output element
// is a 2x2 cofactor, and the whole matrix is finally scaled by 1/det.
__inline
Matrix3x3 mtInvert(const Matrix3x3& m)
{
float det = m.m_row[0].s[0]*m.m_row[1].s[1]*m.m_row[2].s[2]+m.m_row[1].s[0]*m.m_row[2].s[1]*m.m_row[0].s[2]+m.m_row[2].s[0]*m.m_row[0].s[1]*m.m_row[1].s[2]
-m.m_row[0].s[0]*m.m_row[2].s[1]*m.m_row[1].s[2]-m.m_row[2].s[0]*m.m_row[1].s[1]*m.m_row[0].s[2]-m.m_row[1].s[0]*m.m_row[0].s[1]*m.m_row[2].s[2];
// Asserts only on an exactly-zero determinant; near-singular matrices pass
// this check but yield very large (or inf) entries after the 1/det scale.
ADLASSERT( det );
Matrix3x3 ans;
ans.m_row[0].s[0] = m.m_row[1].s[1]*m.m_row[2].s[2] - m.m_row[1].s[2]*m.m_row[2].s[1];
ans.m_row[0].s[1] = m.m_row[0].s[2]*m.m_row[2].s[1] - m.m_row[0].s[1]*m.m_row[2].s[2];
ans.m_row[0].s[2] = m.m_row[0].s[1]*m.m_row[1].s[2] - m.m_row[0].s[2]*m.m_row[1].s[1];
ans.m_row[0].w = 0.f;
ans.m_row[1].s[0] = m.m_row[1].s[2]*m.m_row[2].s[0] - m.m_row[1].s[0]*m.m_row[2].s[2];
ans.m_row[1].s[1] = m.m_row[0].s[0]*m.m_row[2].s[2] - m.m_row[0].s[2]*m.m_row[2].s[0];
ans.m_row[1].s[2] = m.m_row[0].s[2]*m.m_row[1].s[0] - m.m_row[0].s[0]*m.m_row[1].s[2];
ans.m_row[1].w = 0.f;
ans.m_row[2].s[0] = m.m_row[1].s[0]*m.m_row[2].s[1] - m.m_row[1].s[1]*m.m_row[2].s[0];
ans.m_row[2].s[1] = m.m_row[0].s[1]*m.m_row[2].s[0] - m.m_row[0].s[0]*m.m_row[2].s[1];
ans.m_row[2].s[2] = m.m_row[0].s[0]*m.m_row[1].s[1] - m.m_row[0].s[1]*m.m_row[1].s[0];
ans.m_row[2].w = 0.f;
ans = mtMul2((1.0f/det), ans);
return ans;
}
// Builds a matrix from three explicit rows (rows are copied as-is,
// including their w components).
__inline
Matrix3x3 mtSet( const float4& a, const float4& b, const float4& c )
{
	Matrix3x3 m;
	m.m_row[0] = a;
	m.m_row[1] = b;
	m.m_row[2] = c;
	return m;
}

// Component-wise matrix addition.
__inline
Matrix3x3 operator+(const Matrix3x3& a, const Matrix3x3& b)
{
	Matrix3x3 out;
	for(int i=0; i<3; i++)
		out.m_row[i] = a.m_row[i] + b.m_row[i];
	return out;
}
};
#endif

View File

@@ -0,0 +1,159 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#ifndef QUATERNION_H
#define QUATERNION_H
#include <AdlPrimitives/Math/Matrix3x3.h>
namespace adl
{
// Quaternion stored as a float4: s[0..2] = vector part, s[3] = scalar part.
typedef float4 Quaternion;
// Forward declarations.
__inline
Quaternion qtSet(const float4& axis, float angle);
__inline
Quaternion qtMul(const Quaternion& a, const Quaternion& b);
__inline
float4 qtRotate(const Quaternion& q, const float4& vec);
__inline
float4 qtInvRotate(const Quaternion& q, const float4& vec);
__inline
Quaternion qtInvert(const Quaternion& q);
__inline
Matrix3x3 qtGetRotationMatrix(const Quaternion& quat);
__inline
Quaternion qtNormalize(const Quaternion& q);
// Identity rotation: zero vector part, unit scalar part.
__inline
Quaternion qtGetIdentity() { return make_float4(0,0,0,1); }
// Builds a unit quaternion representing a rotation of 'angle' radians
// around 'axis'; the axis is normalized internally.
__inline
Quaternion qtSet(const float4& axis, float angle)
{
	float4 nAxis = normalize3( axis );
	// Hoisted: sin(angle/2) was previously evaluated once per component.
	float s = sin(angle/2);
	Quaternion q;
	q.s[0] = nAxis.s[0]*s;
	q.s[1] = nAxis.s[1]*s;
	q.s[2] = nAxis.s[2]*s;
	q.s[3] = cos(angle/2);
	return q;
}
// Hamilton product a*b. Vector part: cross(a,b) + a.w*b + b.w*a (the w
// written by the += is overwritten immediately after); scalar part:
// a.w*b.w - dot(a.xyz, b.xyz).
__inline
Quaternion qtMul(const Quaternion& a, const Quaternion& b)
{
	Quaternion ans = cross3( a, b );
	ans += a.s[3]*b + b.s[3]*a;
	ans.s[3] = a.s[3]*b.s[3] - (a.s[0]*b.s[0]+a.s[1]*b.s[1]+a.s[2]*b.s[2]);
	return ans;
}

// Rotates vec by q: q * (vec with w=0) * conjugate(q).
// Assumes q is unit length, since qtInvert returns the conjugate only.
__inline
float4 qtRotate(const Quaternion& q, const float4& vec)
{
	Quaternion v = vec;
	v.s[3] = 0.f;
	return qtMul( qtMul( q, v ), qtInvert( q ) );
}

// Rotates vec by the inverse of q.
__inline
float4 qtInvRotate(const Quaternion& q, const float4& vec)
{
	return qtRotate( qtInvert( q ), vec );
}

// Conjugate: negated vector part, same scalar part (the inverse for unit
// quaternions).
__inline
Quaternion qtInvert(const Quaternion& q)
{
	return make_float4( -q.s[0], -q.s[1], -q.s[2], q.s[3] );
}
// Standard unit-quaternion to rotation-matrix expansion; the padding w of
// every output row is set to 0. Assumes quat is normalized.
__inline
Matrix3x3 qtGetRotationMatrix(const Quaternion& quat)
{
float4 quat2 = make_float4(quat.s[0]*quat.s[0], quat.s[1]*quat.s[1], quat.s[2]*quat.s[2], 0.f);
Matrix3x3 out;
out.m_row[0].s[0]=1-2*quat2.s[1]-2*quat2.s[2];
out.m_row[0].s[1]=2*quat.s[0]*quat.s[1]-2*quat.s[3]*quat.s[2];
out.m_row[0].s[2]=2*quat.s[0]*quat.s[2]+2*quat.s[3]*quat.s[1];
out.m_row[0].s[3] = 0.f;
out.m_row[1].s[0]=2*quat.s[0]*quat.s[1]+2*quat.s[3]*quat.s[2];
out.m_row[1].s[1]=1-2*quat2.s[0]-2*quat2.s[2];
out.m_row[1].s[2]=2*quat.s[1]*quat.s[2]-2*quat.s[3]*quat.s[0];
out.m_row[1].s[3] = 0.f;
out.m_row[2].s[0]=2*quat.s[0]*quat.s[2]-2*quat.s[3]*quat.s[1];
out.m_row[2].s[1]=2*quat.s[1]*quat.s[2]+2*quat.s[3]*quat.s[0];
out.m_row[2].s[2]=1-2*quat2.s[0]-2*quat2.s[1];
out.m_row[2].s[3] = 0.f;
return out;
}
// Converts a rotation matrix back to a quaternion (w-major branch only).
// NOTE(review): q.w is sqrt(trace+1)/2 and inv4w divides by 4*q.w, so this
// produces NaN/inf when trace <= -1 or w ~= 0 (rotations near 180 degrees).
// Confirm callers never feed such matrices, or add the standard alternate
// branches.
__inline
Quaternion qtGetQuaternion(const Matrix3x3* m)
{
Quaternion q;
q.w = sqrtf( m[0].m_row[0].x + m[0].m_row[1].y + m[0].m_row[2].z + 1 ) * 0.5f;
float inv4w = 1.f/(4.f*q.w);
q.x = (m[0].m_row[2].y-m[0].m_row[1].z)*inv4w;
q.y = (m[0].m_row[0].z-m[0].m_row[2].x)*inv4w;
q.z = (m[0].m_row[1].x-m[0].m_row[0].y)*inv4w;
return q;
}
// Normalizes all four components of q.
__inline
Quaternion qtNormalize(const Quaternion& q)
{
	return normalize4( q );
}

// Rigid transform: rotate p by 'orientation', then translate.
__inline
float4 transform(const float4& p, const float4& translation, const Quaternion& orientation)
{
	float4 rotated = qtRotate( orientation, p );
	return rotated + translation;
}

// Inverse rigid transform: un-translate, then rotate by the inverse
// orientation (qtInvRotate expands to exactly the original
// qtRotate(qtInvert(...)) form).
__inline
float4 invTransform(const float4& p, const float4& translation, const Quaternion& orientation)
{
	return qtInvRotate( orientation, p-translation );
}
};
#endif

View File

@@ -0,0 +1,73 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma once
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
namespace adl
{
// Shared option enum for all PrefixScan device specializations.
class PrefixScanBase
{
public:
enum Option
{
INCLUSIVE,
EXCLUSIVE
};
};
// Parallel prefix scan over u32 keys for a given device backend.
// Usage: allocate() once, execute() per scan, deallocate() when done.
template<DeviceType TYPE>
class PrefixScan : public PrefixScanBase
{
public:
typedef Launcher::BufferInfo BufferInfo;
enum
{
// Work-group size; each group processes BLOCK_SIZE*2 elements (see execute()).
BLOCK_SIZE = 128
};
// Per-instance state created by allocate().
struct Data
{
// Scan flavor requested at allocate(); execute() only supports EXCLUSIVE.
Option m_option;
const Device* m_device;
Kernel* m_localScanKernel;
Kernel* m_blockSumKernel;
Kernel* m_propagationKernel;
// Scratch buffer holding per-block partial sums between the phases.
Buffer<u32>* m_workBuffer;
Buffer<int4>* m_constBuffer[3];// todo. dx need one for each
// Capacity in elements; execute() asserts n <= m_maxSize.
int m_maxSize;
};
static
Data* allocate(const Device* deviceData, int maxSize, Option option = EXCLUSIVE);
static
void deallocate(Data* data);
// Scans 'n' elements of src into dst; if 'sum' is non-null, element n-1
// of the result is copied back to it.
static
void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, u32* sum = 0);
};
#include <AdlPrimitives/Scan/PrefixScanHost.inl>
#include <AdlPrimitives/Scan/PrefixScan.inl>
};

View File

@@ -0,0 +1,125 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
// Path to the kernel source file, used when kernels are not embedded as strings.
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Scan\\PrefixScanKernels"
// Kernel entry-point names, one per phase of the scan.
#define KERNEL0 "LocalScanKernel"
#define KERNEL1 "TopLevelScanKernel"
#define KERNEL2 "AddOffsetKernel"
#include <AdlPrimitives/Scan/PrefixScanKernelsCL.h>
#include <AdlPrimitives/Scan/PrefixScanKernelsDX11.h>
// Creates the per-instance Data: compiles/fetches the three kernels, sizes
// the per-block scratch buffer and allocates one constant buffer per kernel.
// maxSize is capped at BLOCK_SIZE*2*2048 because each work-group scans
// BLOCK_SIZE*2 elements and the second phase runs as a single work-group.
template<DeviceType TYPE>
typename PrefixScan<TYPE>::Data* PrefixScan<TYPE>::allocate(const Device* device, int maxSize, Option option)
{
ADLASSERT( TYPE == device->m_type );
ADLASSERT( maxSize <= BLOCK_SIZE*2*2048 );
// Embedded kernel strings indexed by backend; null when loading from PATH.
const char* src[] =
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
{prefixScanKernelsCL, prefixScanKernelsDX11};
#else
{0,0};
#endif
Data* data = new Data;
data->m_device = device;
data->m_localScanKernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
data->m_blockSumKernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
data->m_propagationKernel = device->getKernel( PATH, KERNEL2, 0, src[TYPE] );
// Room for one partial sum per block (at least BLOCK_SIZE of them),
// rounded up to a BLOCK_SIZE multiple, plus one extra slot.
int bufSize = (NEXTMULTIPLEOF( max2( maxSize/BLOCK_SIZE, (int)BLOCK_SIZE ), BLOCK_SIZE )+1);
data->m_workBuffer = new Buffer<u32>( device, bufSize );
data->m_constBuffer[0] = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
data->m_constBuffer[1] = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
data->m_constBuffer[2] = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
data->m_maxSize = maxSize;
data->m_option = option;
return data;
}
// Releases everything allocate() created; 'data' must not be used afterwards.
// NOTE(review): the kernels obtained via device->getKernel() are not released
// here -- presumably the Device caches and owns them; confirm.
template<DeviceType TYPE>
void PrefixScan<TYPE>::deallocate(Data* data)
{
delete data->m_workBuffer;
delete data->m_constBuffer[0];
delete data->m_constBuffer[1];
delete data->m_constBuffer[2];
delete data;
}
// Exclusive scan of the first n elements of src into dst, in three phases:
// 1) per-block local scan (each work-group covers BLOCK_SIZE*2 elements,
//    block sums go to m_workBuffer),
// 2) a single work-group scans the block sums,
// 3) the scanned block sums are added back to every block but the first.
// If 'sum' is non-null, element n-1 of dst is copied back into it.
template<DeviceType TYPE>
void PrefixScan<TYPE>::execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, u32* sum)
{
ADLASSERT( data );
ADLASSERT( n <= data->m_maxSize );
// INCLUSIVE is declared in the Option enum but not implemented here.
ADLASSERT( data->m_option == EXCLUSIVE );
const u32 numBlocks = u32( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
// Packed kernel parameters: x = element count, y = number of blocks,
// z = numBlocks rounded up to a power of two.
int4 constBuffer;
constBuffer.x = n;
constBuffer.y = numBlocks;
constBuffer.z = (int)nextPowerOf2( numBlocks );
// Map the adl buffers to backend-native buffers (true/false presumably
// selects whether existing contents are copied in -- confirm).
Buffer<u32>* srcNative = BufferUtils::map<TYPE, true>( data->m_device, &src );
Buffer<u32>* dstNative = BufferUtils::map<TYPE, false>( data->m_device, &dst );
{
// Phase 1: local scan, one work-group per block.
BufferInfo bInfo[] = { BufferInfo( dstNative ), BufferInfo( srcNative ), BufferInfo( data->m_workBuffer ) };
Launcher launcher( data->m_device, data->m_localScanKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[0], constBuffer );
launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
}
{
// Phase 2: scan the per-block sums with a single work-group.
BufferInfo bInfo[] = { BufferInfo( data->m_workBuffer ) };
Launcher launcher( data->m_device, data->m_blockSumKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[1], constBuffer );
launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
}
if( numBlocks > 1 )
{
// Phase 3: propagate the scanned block offsets to blocks 1..numBlocks-1.
BufferInfo bInfo[] = { BufferInfo( dstNative ), BufferInfo( data->m_workBuffer ) };
Launcher launcher( data->m_device, data->m_propagationKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
launcher.setConst( *data->m_constBuffer[2], constBuffer );
launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
}
DeviceUtils::waitForCompletion( data->m_device );
if( sum )
{
// Copy back the last scanned element (offset n-1, one element).
dstNative->read( sum, 1, n-1);
}
DeviceUtils::waitForCompletion( data->m_device );
BufferUtils::unmap<false>( srcNative, &src );
BufferUtils::unmap<true>( dstNative, &dst );
}
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2

View File

@@ -0,0 +1,74 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//	Reference/serial specialization of the prefix scan for TYPE_HOST devices.
//	Unlike the device path, it supports both EXCLUSIVE and INCLUSIVE options.
template<>
class PrefixScan<TYPE_HOST> : public PrefixScanBase
{
	public:
		struct Data
		{
			Option m_option;	//	EXCLUSIVE or INCLUSIVE
		};

		//	maxSize is accepted for interface parity with the device path but
		//	the host scan needs no scratch storage.
		static
		Data* allocate(const Device* deviceData, int maxSize, Option option = EXCLUSIVE)
		{
			ADLASSERT( deviceData->m_type == TYPE_HOST );
			Data* data = new Data;
			data->m_option = option;
			return data;
		}

		static
		void deallocate(Data* data)
		{
			delete data;
		}

		//	Scans n elements of src into dst. If 'sum' is non-null it receives
		//	dst[n-1] — the same element the device path reads back — or 0 when
		//	n == 0.
		static
		void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, u32* sum = 0)
		{
			ADLASSERT( src.getType() == TYPE_HOST && dst.getType() == TYPE_HOST );

			HostBuffer<u32>& hSrc = (HostBuffer<u32>&)src;
			HostBuffer<u32>& hDst = (HostBuffer<u32>&)dst;

			u32 s = 0;
			if( data->m_option == EXCLUSIVE )
			{
				//	dst[i] = sum of src[0..i-1]
				for(int i=0; i<n; i++)
				{
					hDst[i] = s;
					s += hSrc[i];
				}
			}
			else
			{
				//	dst[i] = sum of src[0..i]
				for(int i=0; i<n; i++)
				{
					s += hSrc[i];
					hDst[i] = s;
				}
			}

			if( sum )
			{
				//	fix: guard n == 0, which previously read hDst[-1]
				*sum = (n>0)? hDst[n-1]: 0;
			}
		}
};

View File

@@ -0,0 +1,153 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//	Macro aliases so this OpenCL source reads the same as the DX11/HLSL port.
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)

// takahiro end
#define WG_SIZE 128

//	Must match the int4 constant buffer filled in PrefixScan<TYPE>::execute.
typedef struct
{
	uint m_numElems;	//	total number of elements to scan
	uint m_numBlocks;	//	number of WG_SIZE*2 blocks
	uint m_numScanBlocks;	//	m_numBlocks rounded up to a power of 2
	uint m_padding[1];
} ConstBuffer;
//	Work-group cooperative exclusive scan of the n values in 'data' (LDS),
//	performed in place via the classic two-phase up-sweep / down-sweep scheme.
//	n must be a power of two. Returns the total of all n inputs; the value is
//	only meaningful for work-item 0, which is the one that writes blocksum.
u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)
{
	u32 blocksum;
	int offset = 1;
	//	up-sweep: build a reduction tree of partial sums in place
	for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
	{
		GROUP_LDS_BARRIER;
		for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
		{
			int ai = offset*(2*iIdx+1)-1;
			int bi = offset*(2*iIdx+2)-1;
			data[bi] += data[ai];
		}
	}
	GROUP_LDS_BARRIER;
	if( lIdx == 0 )
	{
		//	capture the total, then clear the root for the exclusive down-sweep
		blocksum = data[ n-1 ];
		data[ n-1 ] = 0;
	}
	GROUP_LDS_BARRIER;
	offset >>= 1;
	//	down-sweep: walk back down the tree propagating prefixes
	for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
	{
		GROUP_LDS_BARRIER;
		for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
		{
			int ai = offset*(2*iIdx+1)-1;
			int bi = offset*(2*iIdx+2)-1;
			u32 temp = data[ai];
			data[ai] = data[bi];
			data[bi] += temp;
		}
	}
	GROUP_LDS_BARRIER;
	return blocksum;
}
//	Pass 1: each work-group loads WG_SIZE*2 elements (zero-padded past
//	m_numElems), exclusive-scans them in LDS, writes the scanned chunk back to
//	dst and its block total to sumBuffer[group].
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,
					ConstBuffer cb)
{
	__local u32 ldsData[WG_SIZE*2];
	int gIdx = GET_GLOBAL_IDX;
	int lIdx = GET_LOCAL_IDX;
	//	two elements per work-item; out-of-range lanes contribute 0
	ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;
	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;
	u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);
	//	only work-item 0 holds the valid block total
	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;
	if( (2*gIdx) < cb.m_numElems )
	{
		dst[2*gIdx] = ldsData[2*lIdx];
	}
	if( (2*gIdx + 1) < cb.m_numElems )
	{
		dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
	}
}
//	Pass 3: add the scanned block offset (produced by TopLevelScanKernel) to
//	every element of blocks 1..numBlocks-1; block 0 already has the correct
//	prefixes, so group g services data block g+1.
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, ConstBuffer cb)
{
	const u32 blockSize = WG_SIZE*2;
	//	group g handles data block g+1
	int myIdx = GET_GROUP_IDX+1;
	int lIdx = GET_LOCAL_IDX;
	u32 iBlockSum = blockSum[myIdx];
	//	clamp the last (possibly partial) block to m_numElems
	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);
	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)
	{
		dst[i] += iBlockSum;
	}
}
//	Pass 2: single work-group exclusive scan over the per-block totals, in
//	place in dst (the work buffer). Operates on m_numScanBlocks entries
//	(m_numBlocks rounded up to a power of 2, zero-padded) and also appends the
//	grand total at dst[m_numBlocks]. Requires m_numScanBlocks <= 2048.
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
__kernel
void TopLevelScanKernel(__global u32* dst, ConstBuffer cb)
{
	__local u32 ldsData[2048];
	int gIdx = GET_GLOBAL_IDX;
	int lIdx = GET_LOCAL_IDX;
	int lSize = GET_GROUP_SIZE;
	//	stage the block totals into LDS, zero-padding up to the power of 2
	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )
	{
		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;
	}
	GROUP_LDS_BARRIER;
	u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);
	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )
	{
		dst[i] = ldsData[i];
	}
	if( gIdx == 0 )
	{
		//	publish the grand total just past the scanned block sums
		dst[cb.m_numBlocks] = sum;
	}
}

View File

@@ -0,0 +1,157 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//	Macro aliases so this HLSL source reads the same as the OpenCL port.
typedef uint u32;
#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
// takahiro end
#define WG_SIZE 128
//	HLSL has no runtime group-size query; dispatches always use WG_SIZE threads
#define GET_GROUP_SIZE WG_SIZE
//	Must match the int4 constant buffer filled in PrefixScan<TYPE>::execute.
cbuffer SortCB : register( b0 )
{
	int m_numElems;		//	total number of elements to scan
	int m_numBlocks;	//	number of WG_SIZE*2 blocks
	int m_numScanBlocks;	//	m_numBlocks rounded up to a power of 2
};
RWStructuredBuffer<uint> dst : register( u0 );
RWStructuredBuffer<uint> src : register( u1 );
RWStructuredBuffer<uint> sumBuffer : register( u2 );
//	Shared LDS scratch used by all kernels in this file (2048 = max scan blocks)
groupshared u32 ldsData[2048];
//	Group-cooperative exclusive scan of the first n entries of the groupshared
//	ldsData array, in place, via the two-phase up-sweep / down-sweep scheme.
//	n must be a power of two. Returns the total of all n inputs; only
//	meaningful for thread 0, which is the one that writes blocksum.
u32 ScanExclusive(u32 n, int lIdx, int lSize)
{
	u32 blocksum;
	int offset = 1;
	//	up-sweep: build a reduction tree of partial sums in place
	for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)
	{
		GROUP_LDS_BARRIER;
		for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)
		{
			int ai = offset*(2*iIdx+1)-1;
			int bi = offset*(2*iIdx+2)-1;
			ldsData[bi] += ldsData[ai];
		}
	}
	GROUP_LDS_BARRIER;
	if( lIdx == 0 )
	{
		//	capture the total, then clear the root for the exclusive down-sweep
		blocksum = ldsData[ n-1 ];
		ldsData[ n-1 ] = 0;
	}
	GROUP_LDS_BARRIER;
	offset >>= 1;
	//	down-sweep: walk back down the tree propagating prefixes
	for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )
	{
		GROUP_LDS_BARRIER;
		for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )
		{
			int ai = offset*(2*iIdx+1)-1;
			int bi = offset*(2*iIdx+2)-1;
			u32 temp = ldsData[ai];
			ldsData[ai] = ldsData[bi];
			ldsData[bi] += temp;
		}
	}
	GROUP_LDS_BARRIER;
	return blocksum;
}
//	Pass 1: each group loads WG_SIZE*2 elements (zero-padded past m_numElems)
//	into the first WG_SIZE*2 slots of ldsData, exclusive-scans them, writes
//	the scanned chunk to dst and the block total to sumBuffer[group].
[numthreads(WG_SIZE, 1, 1)]
void LocalScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)
{
	int gIdx = GET_GLOBAL_IDX;
	int lIdx = GET_LOCAL_IDX;
	//	two elements per thread; out-of-range lanes contribute 0
	ldsData[2*lIdx] = ( 2*gIdx < m_numElems )? src[2*gIdx]: 0;
	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < m_numElems )? src[2*gIdx + 1]: 0;
	u32 sum = ScanExclusive(WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);
	//	only thread 0 holds the valid block total
	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;
	if( (2*gIdx) < m_numElems )
	{
		dst[2*gIdx] = ldsData[2*lIdx];
	}
	if( (2*gIdx + 1) < m_numElems )
	{
		dst[2*gIdx + 1] = ldsData[2*lIdx + 1];
	}
}
//	Pass 2: single-group exclusive scan over the per-block totals, in place in
//	dst (bound to the work buffer for this dispatch). Scans m_numScanBlocks
//	entries (power of 2, zero-padded) and appends the grand total at
//	dst[m_numBlocks]. Requires m_numScanBlocks <= 2048 (ldsData capacity).
[numthreads(WG_SIZE, 1, 1)]
void TopLevelScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)
{
	int gIdx = GET_GLOBAL_IDX;
	int lIdx = GET_LOCAL_IDX;
	int lSize = GET_GROUP_SIZE;
	//	stage the block totals into LDS, zero-padding up to the power of 2
	for(int i=lIdx; i<m_numScanBlocks; i+=lSize )
	{
		ldsData[i] = (i<m_numBlocks)? dst[i]:0;
	}
	GROUP_LDS_BARRIER;
	u32 sum = ScanExclusive(m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);
	for(int i=lIdx; i<m_numBlocks; i+=lSize )
	{
		dst[i] = ldsData[i];
	}
	if( gIdx == 0 )
	{
		//	publish the grand total just past the scanned block sums
		dst[m_numBlocks] = sum;
	}
}
//	Second view of UAV slot u1 (same register as 'src' above); for this
//	dispatch the host binds the scanned block-sum buffer there.
RWStructuredBuffer<uint> blockSum2 : register( u1 );
//	Pass 3: add the scanned block offset to every element of blocks
//	1..numBlocks-1; block 0 already has the correct prefixes.
[numthreads(WG_SIZE, 1, 1)]
void AddOffsetKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)
{
	const u32 blockSize = WG_SIZE*2;
	//	group g handles data block g+1
	int myIdx = GET_GROUP_IDX+1;
	int llIdx = GET_LOCAL_IDX;
	u32 iBlockSum = blockSum2[myIdx];
	//	clamp the last (possibly partial) block to m_numElems
	int endValue = min((myIdx+1)*(blockSize), m_numElems);
	for(int i=myIdx*blockSize+llIdx; i<endValue; i+=GET_GROUP_SIZE)
	{
		dst[i] += iBlockSum;
	}
}

View File

@@ -0,0 +1,143 @@
//	Stringified copy of AdlPrimitives/Scan/PrefixScanKernels.cl, used when
//	ADL_LOAD_KERNEL_FROM_STRING is defined (see PrefixScan.inl).
//	Keep in sync with the .cl file; do not edit by hand.
static const char* prefixScanKernelsCL= \
"/*\n"
"		2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128\n"
"\n"
"typedef struct\n"
"{\n"
"	uint m_numElems;\n"
"	uint m_numBlocks;\n"
"	uint m_numScanBlocks;\n"
"	uint m_padding[1];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
"{\n"
"	u32 blocksum;\n"
"	int offset = 1;\n"
"	for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
"	{\n"
"		GROUP_LDS_BARRIER;\n"
"		for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
"		{\n"
"			int ai = offset*(2*iIdx+1)-1;\n"
"			int bi = offset*(2*iIdx+2)-1;\n"
"			data[bi] += data[ai];\n"
"		}\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	if( lIdx == 0 )\n"
"	{\n"
"		blocksum = data[ n-1 ];\n"
"		data[ n-1 ] = 0;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	offset >>= 1;\n"
"	for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
"	{\n"
"		GROUP_LDS_BARRIER;\n"
"		for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
"		{\n"
"			int ai = offset*(2*iIdx+1)-1;\n"
"			int bi = offset*(2*iIdx+2)-1;\n"
"			u32 temp = data[ai];\n"
"			data[ai] = data[bi];\n"
"			data[bi] += temp;\n"
"		}\n"
"	}\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	return blocksum;\n"
"}\n"
"\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
"					ConstBuffer cb)\n"
"{\n"
"	__local u32 ldsData[WG_SIZE*2];\n"
"\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"\n"
"	ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
"\n"
"	u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
"	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
"\n"
"	if( (2*gIdx) < cb.m_numElems )\n"
"	{\n"
"		dst[2*gIdx] = ldsData[2*lIdx];\n"
"	}\n"
"	if( (2*gIdx + 1) < cb.m_numElems )\n"
"	{\n"
"		dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
"	}\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, ConstBuffer cb)\n"
"{\n"
"	const u32 blockSize = WG_SIZE*2;\n"
"\n"
"	int myIdx = GET_GROUP_IDX+1;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"\n"
"	u32 iBlockSum = blockSum[myIdx];\n"
"\n"
"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
"	{\n"
"		dst[i] += iBlockSum;\n"
"	}\n"
"}\n"
"\n"
"\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global u32* dst, ConstBuffer cb)\n"
"{\n"
"	__local u32 ldsData[2048];\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	int lSize = GET_GROUP_SIZE;\n"
"\n"
"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
"	{\n"
"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
"	{\n"
"		dst[i] = ldsData[i];\n"
"	}\n"
"\n"
"	if( gIdx == 0 )\n"
"	{\n"
"		dst[cb.m_numBlocks] = sum;\n"
"	}\n"
"}\n"
;

View File

@@ -0,0 +1,147 @@
//	Stringified copy of AdlPrimitives/Scan/PrefixScanKernels.hlsl, used when
//	ADL_LOAD_KERNEL_FROM_STRING is defined (see PrefixScan.inl).
//	Keep in sync with the .hlsl file; do not edit by hand.
static const char* prefixScanKernelsDX11= \
"/*\n"
"		2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128\n"
"\n"
"#define GET_GROUP_SIZE WG_SIZE\n"
"\n"
"\n"
"cbuffer SortCB : register( b0 )\n"
"{\n"
"	int m_numElems;\n"
"	int m_numBlocks;\n"
"	int m_numScanBlocks;\n"
"};\n"
"	\n"
"RWStructuredBuffer<uint> dst : register( u0 );\n"
"RWStructuredBuffer<uint> src : register( u1 );\n"
"RWStructuredBuffer<uint> sumBuffer : register( u2 );\n"
"\n"
"\n"
"groupshared u32 ldsData[2048];\n"
"\n"
"u32 ScanExclusive(u32 n, int lIdx, int lSize)\n"
"{\n"
"	u32 blocksum;\n"
"	int offset = 1;\n"
"	for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
"	{\n"
"		GROUP_LDS_BARRIER;\n"
"		for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
"		{\n"
"			int ai = offset*(2*iIdx+1)-1;\n"
"			int bi = offset*(2*iIdx+2)-1;\n"
"			ldsData[bi] += ldsData[ai];\n"
"		}\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	if( lIdx == 0 )\n"
"	{\n"
"		blocksum = ldsData[ n-1 ];\n"
"		ldsData[ n-1 ] = 0;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	offset >>= 1;\n"
"	for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
"	{\n"
"		GROUP_LDS_BARRIER;\n"
"		for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
"		{\n"
"			int ai = offset*(2*iIdx+1)-1;\n"
"			int bi = offset*(2*iIdx+2)-1;\n"
"			u32 temp = ldsData[ai];\n"
"			ldsData[ai] = ldsData[bi];\n"
"			ldsData[bi] += temp;\n"
"		}\n"
"	}\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	return blocksum;\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void LocalScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"\n"
"	ldsData[2*lIdx] = ( 2*gIdx < m_numElems )? src[2*gIdx]: 0;\n"
"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < m_numElems )? src[2*gIdx + 1]: 0;\n"
"\n"
"	u32 sum = ScanExclusive(WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
"	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
"\n"
"	if( (2*gIdx) < m_numElems )\n"
"	{\n"
"		dst[2*gIdx] = ldsData[2*lIdx];\n"
"	}\n"
"	if( (2*gIdx + 1) < m_numElems )\n"
"	{\n"
"		dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
"	}\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void TopLevelScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	int lSize = GET_GROUP_SIZE;\n"
"\n"
"	for(int i=lIdx; i<m_numScanBlocks; i+=lSize )\n"
"	{\n"
"		ldsData[i] = (i<m_numBlocks)? dst[i]:0;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	u32 sum = ScanExclusive(m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
"	for(int i=lIdx; i<m_numBlocks; i+=lSize )\n"
"	{\n"
"		dst[i] = ldsData[i];\n"
"	}\n"
"\n"
"	if( gIdx == 0 )\n"
"	{\n"
"		dst[m_numBlocks] = sum;\n"
"	}\n"
"}\n"
"\n"
"\n"
"	\n"
"RWStructuredBuffer<uint> blockSum2 : register( u1 );\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void AddOffsetKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
"{\n"
"	const u32 blockSize = WG_SIZE*2;\n"
"\n"
"	int myIdx = GET_GROUP_IDX+1;\n"
"	int llIdx = GET_LOCAL_IDX;\n"
"\n"
"	u32 iBlockSum = blockSum2[myIdx];\n"
"\n"
"	int endValue = min((myIdx+1)*(blockSize), m_numElems);\n"
"	for(int i=myIdx*blockSize+llIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
"	{\n"
"		dst[i] += iBlockSum;\n"
"	}\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,73 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#pragma once
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
#include <AdlPrimitives/Sort/SortData.h>
#include <AdlPrimitives/Fill/Fill.h>
namespace adl
{

//	Shared option enum for all BoundSearch backends.
class BoundSearchBase
{
	public:
		enum Option
		{
			BOUND_LOWER,	//	dst[key] = first index of that key's run in src
			BOUND_UPPER,	//	dst[key] = one past the last index of the run
			COUNT,		//	dst[key] = number of src entries with that key
		};
};

//	Bound search over a key-sorted Buffer<SortData>, specialized per backend
//	(host / OpenCL / DX11). Allocate with maxSize > 0 to enable the COUNT
//	option, which needs scratch buffers of that capacity.
template<DeviceType TYPE>
class BoundSearch : public BoundSearchBase
{
	public:
		typedef Launcher::BufferInfo BufferInfo;

		struct Data
		{
			const Device* m_device;
			Kernel* m_lowerSortDataKernel;
			Kernel* m_upperSortDataKernel;
			Kernel* m_subtractKernel;	//	only fetched when maxSize > 0
			Buffer<int4>* m_constBuffer;
			Buffer<u32>* m_lower;		//	COUNT scratch (null if maxSize == 0)
			Buffer<u32>* m_upper;		//	COUNT scratch (null if maxSize == 0)
			typename Fill<TYPE>::Data* m_fillData;
		};

		static
		Data* allocate(const Device* deviceData, int maxSize = 0);

		static
		void deallocate(Data* data);

		//	src has to be src[i].m_key <= src[i+1].m_key
		static
		void execute(Data* data, Buffer<SortData>& src, u32 nSrc, Buffer<u32>& dst, u32 nDst, Option option = BOUND_LOWER );

//		static
//		void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, Option option = );
};

#include <AdlPrimitives/Search/BoundSearchHost.inl>
#include <AdlPrimitives/Search/BoundSearch.inl>

};

View File

@@ -0,0 +1,128 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Search\\BoundSearchKernels"
#define KERNEL0 "SearchSortDataLowerKernel"
#define KERNEL1 "SearchSortDataUpperKernel"
#define KERNEL2 "SubtractKernel"
#include <AdlPrimitives/Search/BoundSearchKernelsCL.h>
#include <AdlPrimitives/Search/BoundSearchKernelsDX11.h>
//	Creates the kernels and (optionally) the scratch state for BoundSearch on
//	'device'. With maxSize > 0 the subtract kernel and the lower/upper scratch
//	buffers needed by the COUNT option are created as well; with maxSize == 0
//	only BOUND_LOWER / BOUND_UPPER are usable.
template<DeviceType TYPE>
typename BoundSearch<TYPE>::Data* BoundSearch<TYPE>::allocate(const Device* device, int maxSize)
{
	ADLASSERT( TYPE == device->m_type );
	const char* src[] =
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
		{boundSearchKernelsCL, boundSearchKernelsDX11};
#else
		{0,0};
#endif

	Data* data = new Data;
	data->m_device = device;
	data->m_lowerSortDataKernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
	data->m_upperSortDataKernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
	data->m_constBuffer = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );

	//	fix: m_subtractKernel was left uninitialized when maxSize == 0;
	//	null it so the COUNT-only members are consistently unset
	data->m_subtractKernel = 0;
	if( maxSize )
	{
		data->m_subtractKernel = device->getKernel( PATH, KERNEL2, 0, src[TYPE] );
	}

	data->m_lower = (maxSize == 0)? 0: new Buffer<u32>( device, maxSize );
	data->m_upper = (maxSize == 0)? 0: new Buffer<u32>( device, maxSize );
	data->m_fillData = (maxSize == 0)? 0: Fill<TYPE>::allocate( device );

	return data;
}
//	Releases everything allocate() created directly; kernels stay in the
//	device's kernel cache.
template<DeviceType TYPE>
void BoundSearch<TYPE>::deallocate(Data* data)
{
	delete data->m_constBuffer;
	//	deleting a null pointer is a no-op, so the maxSize==0 case is fine
	delete data->m_lower;
	delete data->m_upper;
	if( data->m_fillData ) Fill<TYPE>::deallocate( data->m_fillData );
	delete data;
}
//	Runs the bound search on the device. For BOUND_LOWER/BOUND_UPPER, dst[key]
//	receives the lower / upper index of that key's run in the sorted src.
//	COUNT computes upper - lower per key and requires allocate() to have been
//	called with maxSize > 0.
template<DeviceType TYPE>
void BoundSearch<TYPE>::execute(Data* data, Buffer<SortData>& src, u32 nSrc, Buffer<u32>& dst, u32 nDst, Option option )
{
	int4 constBuffer;
	constBuffer.x = nSrc;
	constBuffer.y = nDst;

	Buffer<SortData>* srcNative = BufferUtils::map<TYPE, true>( data->m_device, &src );
	Buffer<u32>* dstNative = BufferUtils::map<TYPE, false>( data->m_device, &dst );

	if( option == BOUND_LOWER )
	{
		BufferInfo bInfo[] = { BufferInfo( srcNative, true ), BufferInfo( dstNative ) };

		Launcher launcher( data->m_device, data->m_lowerSortDataKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
		launcher.setConst( *data->m_constBuffer, constBuffer );
		launcher.launch1D( nSrc, 64 );
	}
	else if( option == BOUND_UPPER )
	{
		BufferInfo bInfo[] = { BufferInfo( srcNative, true ), BufferInfo( dstNative ) };

		Launcher launcher( data->m_device, data->m_upperSortDataKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
		launcher.setConst( *data->m_constBuffer, constBuffer );
		//	one extra work-item to close the run that ends at src[nSrc-1]
		launcher.launch1D( nSrc+1, 64 );
	}
	else if( option == COUNT )
	{
		ADLASSERT( data->m_lower );
		ADLASSERT( data->m_upper );
		//	NOTE(review): these comparisons look inverted — the scratch
		//	capacity should be >= nDst for the fills below to be safe;
		//	verify against Buffer::getSize()'s semantics
		ADLASSERT( data->m_lower->getSize() <= (int)nDst );
		ADLASSERT( data->m_upper->getSize() <= (int)nDst );

		//	zero both scratch buffers, fill in the bounds, then subtract
		int zero = 0;
		Fill<TYPE>::execute( data->m_fillData, (Buffer<int>&)*data->m_lower, zero, nDst );
		Fill<TYPE>::execute( data->m_fillData, (Buffer<int>&)*data->m_upper, zero, nDst );

		execute( data, src, nSrc, *data->m_lower, nDst, BOUND_LOWER );
		execute( data, src, nSrc, *data->m_upper, nDst, BOUND_UPPER );

		{	//	dst = upper - lower (per-key run length)
			BufferInfo bInfo[] = { BufferInfo( data->m_upper, true ), BufferInfo( data->m_lower, true ), BufferInfo( dstNative ) };

			Launcher launcher( data->m_device, data->m_subtractKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer, constBuffer );
			launcher.launch1D( nDst, 64 );
		}
	}
	else
	{
		ADLASSERT( 0 );
	}

	BufferUtils::unmap<false>( srcNative, &src );
	BufferUtils::unmap<true>( dstNative, &dst );
}
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2

View File

@@ -0,0 +1,111 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
template<>
class BoundSearch<TYPE_HOST> : public BoundSearchBase
{
public:
typedef Launcher::BufferInfo BufferInfo;
struct Data
{
const Device* m_device;
};
static
Data* allocate(const Device* deviceData, int maxSize = 0)
{
ADLASSERT( deviceData->m_type == TYPE_HOST );
Data* data = new Data;
data->m_device = deviceData;
return data;
}
static
void deallocate(Data* data)
{
delete data;
}
static
void execute(Data* data, Buffer<SortData>& rawSrc, u32 nSrc, Buffer<u32>& rawDst, u32 nDst, Option option = BOUND_LOWER)
{
ADLASSERT( rawSrc.getType() == TYPE_HOST );
ADLASSERT( rawDst.getType() == TYPE_HOST );
HostBuffer<SortData>& src = *(HostBuffer<SortData>*)&rawSrc;
HostBuffer<u32>& dst = *(HostBuffer<u32>*)&rawDst;
for(int i=0; i<nSrc-1; i++)
ADLASSERT( src[i].m_key <= src[i+1].m_key );
if( option == BOUND_LOWER )
{
for(u32 i=0; i<nSrc; i++)
{
SortData& iData = (i==0)? SortData(-1,-1): src[i-1];
SortData& jData = (i==nSrc)? SortData(nDst, nDst): src[i];
if( iData.m_key != jData.m_key )
{
// for(u32 k=iData.m_key+1; k<=min(jData.m_key,nDst-1); k++)
u32 k = jData.m_key;
{
dst[k] = i;
}
}
}
}
else if( option == BOUND_UPPER )
{
for(u32 i=0; i<nSrc+1; i++)
{
SortData& iData = (i==0)? SortData(0,0): src[i-1];
SortData& jData = (i==nSrc)? SortData(nDst, nDst): src[i];
if( iData.m_key != jData.m_key )
{
// for(u32 k=iData.m_key; k<min(jData.m_key,nDst); k++)
u32 k = iData.m_key;
{
dst[k] = i;
}
}
}
}
else if( option == COUNT )
{
HostBuffer<u32> lower( data->m_device, nDst );
HostBuffer<u32> upper( data->m_device, nDst );
for(u32 i=0; i<nDst; i++) { lower[i] = upper[i] = 0; }
execute( data, rawSrc, nSrc, lower, nDst, BOUND_LOWER );
execute( data, rawSrc, nSrc, upper, nDst, BOUND_UPPER );
for(u32 i=0; i<nDst; i++) { dst[i] = upper[i] - lower[i]; }
}
else
{
ADLASSERT( 0 );
}
}
// static
// void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, Option option = );
};

View File

@@ -0,0 +1,112 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//	Macro aliases so this OpenCL source reads the same as the DX11/HLSL port.
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)

//	Must match adl::SortData: a key/value pair, sorted ascending by m_key.
typedef struct
{
	u32 m_key;
	u32 m_value;
}SortData;

//	Mirrors the int4 constant buffer filled by BoundSearch<TYPE>::execute
//	(x = nSrc, y = nDst).
typedef struct
{
	u32 m_nSrc;
	u32 m_nDst;
	u32 m_padding[2];
} ConstBuffer;
//	One work-item per src entry: where the key changes between src[gIdx-1] and
//	src[gIdx], record gIdx in dst as the lower bound of the new key's run.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst,
					ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;
	u32 nSrc = cb.m_nSrc;
	u32 nDst = cb.m_nDst;
	if( gIdx < nSrc )
	{
		//	sentinels for the boundaries before the first / after the last entry
		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);
		SortData end; end.m_key = nDst; end.m_value = nDst;
		SortData iData = (gIdx==0)? first: src[gIdx-1];
		SortData jData = (gIdx==nSrc)? end: src[gIdx];
		if( iData.m_key != jData.m_key )
		{
//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)
			u32 k = jData.m_key;	//	NOTE(review): assumes keys < nDst
			{
				dst[k] = gIdx;
			}
		}
	}
}
//	Like the lower-bound kernel, but records gIdx as the one-past-last index
//	of the run that just ended; dispatched with nSrc+1 work-items so the run
//	ending at src[nSrc-1] is closed as well.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst,
					ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;
	u32 nSrc = cb.m_nSrc;
	u32 nDst = cb.m_nDst;
	if( gIdx < nSrc+1 )
	{
		//	sentinels for the boundaries before the first / after the last entry
		SortData first; first.m_key = 0; first.m_value = 0;
		SortData end; end.m_key = nDst; end.m_value = nDst;
		SortData iData = (gIdx==0)? first: src[gIdx-1];
		SortData jData = (gIdx==nSrc)? end: src[gIdx];
		if( iData.m_key != jData.m_key )
		{
//			for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)
			u32 k = iData.m_key;
			{
				dst[k] = gIdx;
			}
		}
	}
}
//	Element-wise C[i] = A[i] - B[i] for i < nDst; used by the COUNT option to
//	turn upper/lower bounds into per-key run lengths.
__attribute__((reqd_work_group_size(64,1,1)))
__kernel
void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C,
					ConstBuffer cb)
{
	int gIdx = GET_GLOBAL_IDX;
	u32 nSrc = cb.m_nSrc;	//	unused here; kept for the shared ConstBuffer layout
	u32 nDst = cb.m_nDst;
	if( gIdx < nDst )
	{
		C[gIdx] = A[gIdx] - B[gIdx];
	}
}

View File

@@ -0,0 +1,104 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
//	Macro aliases so this HLSL source reads the same as the OpenCL port.
typedef uint u32;
#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
#define AtomInc(x) InterlockedAdd(x, 1)
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)

//	Must match adl::SortData: a key/value pair, sorted ascending by m_key.
typedef struct
{
	u32 m_key;
	u32 m_value;
}SortData;

//	Mirrors the int4 constant buffer filled by BoundSearch<TYPE>::execute
//	(x = nSrc, y = nDst).
cbuffer SortCB : register( b0 )
{
	u32 m_nSrc;
	u32 m_nDst;
	u32 m_padding[2];
};

StructuredBuffer<SortData> src : register( t0 );
RWStructuredBuffer<u32> dst : register( u0 );
//	One thread per src entry: where the key changes between src[gIdx-1] and
//	src[gIdx], record gIdx in dst as the lower bound of the new key's run.
[numthreads(64, 1, 1)]
void SearchSortDataLowerKernel( DEFAULT_ARGS )
{
	int gIdx = GET_GLOBAL_IDX;
	u32 nSrc = m_nSrc;
	u32 nDst = m_nDst;
	if( gIdx < nSrc )
	{
		//	sentinels for the boundaries before the first / after the last entry
		SortData iData;
		SortData jData;
		if( gIdx==0 ) iData.m_key = iData.m_value = (u32)-1;
		else iData = src[gIdx-1];
		if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;
		else jData = src[gIdx];
		if( iData.m_key != jData.m_key )
		{
//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)
			u32 k = jData.m_key;	//	NOTE(review): assumes keys < nDst
			{
				dst[k] = gIdx;
			}
		}
	}
}
// Upper-bound counterpart: for every key-change boundary, records into
// dst[previousKey] the index one past the last element with that key. Runs
// nSrc+1 threads so the final boundary (end of the array) is also written.
[numthreads(64, 1, 1)]
void SearchSortDataUpperKernel( DEFAULT_ARGS )
{
	int gIdx = GET_GLOBAL_IDX;
	u32 nSrc = m_nSrc;
	u32 nDst = m_nDst;
	if( gIdx < nSrc+1 )
	{
		SortData iData;
		SortData jData;
		// Left sentinel: key 0 so the first real key still forms a boundary.
		if( gIdx==0 ) iData.m_key = iData.m_value = 0;
		else iData = src[gIdx-1];
		// Right sentinel: key nDst marks the end of the table.
		if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;
		else jData = src[gIdx];
		if( iData.m_key != jData.m_key )
		{
//		for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)
			u32 k = iData.m_key;
			{
				dst[k] = gIdx;
			}
		}
	}
}

View File

@@ -0,0 +1,102 @@
// Embedded OpenCL source for the bound-search kernels, compiled at runtime
// when ADL_LOAD_KERNEL_FROM_STRING is enabled. This appears to be a
// string-ified copy of the standalone .cl kernel file — keep the two in sync
// rather than editing this string directly.
static const char* boundSearchKernelsCL= \
"/*\n"
"		2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"\n"
"typedef struct\n"
"{\n"
"	u32 m_key; \n"
"	u32 m_value;\n"
"}SortData;\n"
"\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
"	u32 m_nSrc;\n"
"	u32 m_nDst;\n"
"	u32 m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	u32 nSrc = cb.m_nSrc;\n"
"	u32 nDst = cb.m_nDst;\n"
"\n"
"	if( gIdx < nSrc )\n"
"	{\n"
"		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
"\n"
"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
"\n"
"		if( iData.m_key != jData.m_key )\n"
"		{\n"
"//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
"			u32 k = jData.m_key;\n"
"			{\n"
"				dst[k] = gIdx;\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	u32 nSrc = cb.m_nSrc;\n"
"	u32 nDst = cb.m_nDst;\n"
"\n"
"	if( gIdx < nSrc+1 )\n"
"	{\n"
"		SortData first; first.m_key = 0; first.m_value = 0;\n"
"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
"\n"
"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
"\n"
"		if( iData.m_key != jData.m_key )\n"
"		{\n"
"//			for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)\n"
"			u32 k = iData.m_key;\n"
"			{\n"
"				dst[k] = gIdx;\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	u32 nSrc = cb.m_nSrc;\n"
"	u32 nDst = cb.m_nDst;\n"
"\n"
"	if( gIdx < nDst )\n"
"	{\n"
"		C[gIdx] = A[gIdx] - B[gIdx];\n"
"	}\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,94 @@
// Embedded HLSL source for the DX11 bound-search kernels, mirrored from the
// standalone .hlsl file for runtime compilation when
// ADL_LOAD_KERNEL_FROM_STRING is enabled. Keep in sync with the shader file;
// do not edit the string contents independently.
static const char* boundSearchKernelsDX11= \
"/*\n"
"		2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
"	u32 m_key; \n"
"	u32 m_value;\n"
"}SortData;\n"
"\n"
"\n"
"\n"
"cbuffer SortCB : register( b0 )\n"
"{\n"
"	u32 m_nSrc;\n"
"	u32 m_nDst;\n"
"	u32 m_padding[2];\n"
"};\n"
"\n"
"\n"
"StructuredBuffer<SortData> src : register( t0 );\n"
"RWStructuredBuffer<u32> dst : register( u0 );\n"
"\n"
"\n"
"[numthreads(64, 1, 1)]\n"
"void SearchSortDataLowerKernel( DEFAULT_ARGS )\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	u32 nSrc = m_nSrc;\n"
"	u32 nDst = m_nDst;\n"
"\n"
"	if( gIdx < nSrc )\n"
"	{\n"
"		SortData iData;\n"
"		SortData jData;\n"
"		if( gIdx==0 ) iData.m_key = iData.m_value = (u32)-1;\n"
"		else iData = src[gIdx-1];\n"
"\n"
"		if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;\n"
"		else jData = src[gIdx];\n"
"\n"
"		if( iData.m_key != jData.m_key )\n"
"		{\n"
"//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
"			u32 k = jData.m_key;\n"
"			{\n"
"				dst[k] = gIdx;\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"[numthreads(64, 1, 1)]\n"
"void SearchSortDataUpperKernel( DEFAULT_ARGS )\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	u32 nSrc = m_nSrc;\n"
"	u32 nDst = m_nDst;\n"
"\n"
"	if( gIdx < nSrc+1 )\n"
"	{\n"
"		SortData iData;\n"
"		SortData jData;\n"
"		if( gIdx==0 ) iData.m_key = iData.m_value = 0;\n"
"		else iData = src[gIdx-1];\n"
"\n"
"		if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;\n"
"		else jData = src[gIdx];\n"
"\n"
"		if( iData.m_key != jData.m_key )\n"
"		{\n"
"//			for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)\n"
"			u32 k = iData.m_key;\n"
"			{\n"
"				dst[k] = gIdx;\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,53 @@
/*
2011 Takahiro Harada
*/
#pragma once
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
#include <AdlPrimitives/Sort/SortData.h>
#include <AdlPrimitives/Scan/PrefixScan.h>
namespace adl
{
// Non-template base holding the backend selector so callers can name the
// Option enum without committing to a device type.
class RadixSortBase
{
	public:
		// Selects which implementation variant allocate()/execute() dispatch to
		// (simple, standard, or advanced — see the RadixSort*.inl backends).
		enum Option
		{
			SORT_SIMPLE,
			SORT_STANDARD,
			SORT_ADVANCED
		};
};
// Device-parameterized radix sort facade. All entry points are static and
// thread their state through an explicitly allocated Data object; the actual
// work is forwarded to the implementation chosen by Data::m_option.
template<DeviceType TYPE>
class RadixSort : public RadixSortBase
{
	public:
		// Opaque per-instance state; create with allocate(), release with
		// deallocate().
		struct Data
		{
			Option m_option;
			const Device* m_deviceData;
			typename PrefixScan<TYPE>::Data* m_scanData;
			int m_maxSize;
		};
		// Allocates sort state for up to maxSize elements on deviceData.
		static
		Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_STANDARD);
		// Releases everything allocate() created.
		static
		void deallocate(Data* data);
		// Sorts inout[0..n) in place by m_key, considering the low sortBits bits.
		static
		void execute(Data* data, Buffer<SortData>& inout, int n, int sortBits = 32);
};
#include <AdlPrimitives/Sort/RadixSort.inl>
#include <AdlPrimitives/Sort/RadixSortHost.inl>
};

View File

@@ -0,0 +1,58 @@
/*
2011 Takahiro Harada
*/
#include <AdlPrimitives/Sort/RadixSortSimple.inl>
#include <AdlPrimitives/Sort/RadixSortStandard.inl>
#include <AdlPrimitives/Sort/RadixSortAdvanced.inl>
// Forwards the static call `x` to the backend selected by data->m_option.
// `x` is expanded verbatim against each RadixSort*<TYPE> implementation.
#define DISPATCH_IMPL(x) \
	switch( data->m_option ) \
	{ \
	case SORT_SIMPLE: RadixSortSimple<TYPE>::x; break; \
	case SORT_STANDARD: RadixSortStandard<TYPE>::x; break; \
	case SORT_ADVANCED: RadixSortAdvanced<TYPE>::x; break; \
	default:ADLASSERT(0);break; \
	}
// Allocates backend-specific sort state for up to maxSize elements.
// deviceData must match the template device TYPE; the returned pointer must
// be released with deallocate().
template<DeviceType TYPE>
typename RadixSort<TYPE>::Data* RadixSort<TYPE>::allocate(const Device* deviceData, int maxSize, Option option)
{
	ADLASSERT( TYPE == deviceData->m_type );
	// Initialize to 0: if an invalid option reaches the default case and
	// ADLASSERT compiles out in release builds, we return NULL instead of an
	// uninitialized pointer.
	void* dataOut = 0;
	switch( option )
	{
	case SORT_SIMPLE:
		dataOut = RadixSortSimple<TYPE>::allocate( deviceData, maxSize, option );
		break;
	case SORT_STANDARD:
		dataOut = RadixSortStandard<TYPE>::allocate( deviceData, maxSize, option );
		break;
	case SORT_ADVANCED:
		dataOut = RadixSortAdvanced<TYPE>::allocate( deviceData, maxSize, option );
		break;
	default:
		ADLASSERT(0);
		break;
	}
	return (typename RadixSort<TYPE>::Data*)dataOut;
}
// Releases state created by allocate(), dispatching to the backend that
// created it.
template<DeviceType TYPE>
void RadixSort<TYPE>::deallocate(Data* data)
{
	DISPATCH_IMPL( deallocate( data ) );
}
// Sorts inout[0..n) in place by m_key (low sortBits bits), dispatching to the
// backend selected at allocate() time.
template<DeviceType TYPE>
void RadixSort<TYPE>::execute(Data* data, Buffer<SortData>& inout, int n, int sortBits)
{
	DISPATCH_IMPL( execute( data, inout, n, sortBits ) );
}
#undef DISPATCH_IMPL

View File

@@ -0,0 +1,98 @@
/*
2011 Takahiro Harada
*/
#pragma once
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
#include <AdlPrimitives/Copy/Copy.h>
#include <AdlPrimitives/Sort/SortData.h>
namespace adl
{
class RadixSort32Base
{
public:
// enum Option
// {
// SORT_SIMPLE,
// SORT_STANDARD,
// SORT_ADVANCED
// };
};
template<DeviceType TYPE>
class RadixSort32 : public RadixSort32Base
{
public:
typedef Launcher::BufferInfo BufferInfo;
enum
{
DATA_ALIGNMENT = 256,
WG_SIZE = 64,
ELEMENTS_PER_WORK_ITEM = (256/WG_SIZE),
BITS_PER_PASS = 4,
// if you change this, change nPerWI in kernel as well
NUM_WGS = 20*6, // cypress
// NUM_WGS = 24*6, // cayman
// NUM_WGS = 32*4, // nv
};
struct ConstData
{
int m_n;
int m_nWGs;
int m_startBit;
int m_nBlocksPerWG;
};
struct Data
{
const Device* m_device;
int m_maxSize;
Kernel* m_streamCountKernel;
Kernel* m_streamCountSortDataKernel;
Kernel* m_prefixScanKernel;
Kernel* m_sortAndScatterKernel;
Kernel* m_sortAndScatterKeyValueKernel;
Kernel* m_sortAndScatterSortDataKernel;
Buffer<u32>* m_workBuffer0;
Buffer<u32>* m_workBuffer1;
Buffer<u32>* m_workBuffer2;
Buffer<SortData>* m_workBuffer3;
Buffer<ConstData>* m_constBuffer[32/BITS_PER_PASS];
typename Copy<TYPE>::Data* m_copyData;
};
static
Data* allocate(const Device* device, int maxSize);
static
void deallocate(Data* data);
static
void execute(Data* data, Buffer<u32>& inout, int n, int sortBits = 32);
static
void execute(Data* data, Buffer<u32>& in, Buffer<u32>& out, int n, int sortBits = 32);
static
void execute(Data* data, Buffer<u32>& keysIn, Buffer<u32>& keysOut, Buffer<u32>& valuesIn, Buffer<u32>& valuesOut, int n, int sortBits = 32);
static
void execute(Data* data, Buffer<SortData>& keyValuesInOut, int n, int sortBits = 32 );
};
#include <AdlPrimitives/Sort/RadixSort32Host.inl>
#include <AdlPrimitives/Sort/RadixSort32.inl>
};

View File

@@ -0,0 +1,346 @@
/*
2011 Takahiro Harada
*/
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Sort\\RadixSort32Kernels"
#define RADIXSORT32_KERNEL0 "StreamCountKernel"
#define RADIXSORT32_KERNEL1 "PrefixScanKernel"
#define RADIXSORT32_KERNEL2 "SortAndScatterKernel"
#define RADIXSORT32_KERNEL3 "SortAndScatterKeyValueKernel"
#define RADIXSORT32_KERNEL4 "SortAndScatterSortDataKernel"
#define RADIXSORT32_KERNEL5 "StreamCountSortDataKernel"
#include "RadixSort32KernelsCL.h"
#include "RadixSort32KernelsDX11.h"
// todo. Shader compiler (2010JuneSDK) doesn't allow me to place Barriers in SortAndScatterKernel...
// So it only works on a GPU with 64 wide SIMD.
// Builds all GPU-side state for the 32-bit radix sort: the six kernels, the
// ping-pong work buffers, one per-pass const buffer, and a copy helper.
// device must match the template device TYPE; release with deallocate().
template<DeviceType TYPE>
typename RadixSort32<TYPE>::Data* RadixSort32<TYPE>::allocate( const Device* device, int maxSize )
{
	ADLASSERT( TYPE == device->m_type );

	// Kernel sources are baked-in strings when ADL_LOAD_KERNEL_FROM_STRING is
	// defined; otherwise the entries stay 0 and sources are loaded from PATH.
	const char* src[] =
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
		{radixSort32KernelsCL, radixSort32KernelsDX11};
#else
		{0,0};
#endif

	Data* sortData = new Data;
	sortData->m_device = device;
	sortData->m_maxSize = maxSize;

	sortData->m_streamCountKernel = device->getKernel( PATH, RADIXSORT32_KERNEL0, 0, src[TYPE] );
	sortData->m_streamCountSortDataKernel = device->getKernel( PATH, RADIXSORT32_KERNEL5, 0, src[TYPE] );
	sortData->m_prefixScanKernel = device->getKernel( PATH, RADIXSORT32_KERNEL1, 0, src[TYPE] );
	sortData->m_sortAndScatterKernel = device->getKernel( PATH, RADIXSORT32_KERNEL2, 0, src[TYPE] );
	sortData->m_sortAndScatterKeyValueKernel = device->getKernel( PATH, RADIXSORT32_KERNEL3, 0, src[TYPE] );
	sortData->m_sortAndScatterSortDataKernel = device->getKernel( PATH, RADIXSORT32_KERNEL4, 0, src[TYPE] );

	// One histogram counter per (work group, radix bucket) pair.
	const int nHistogramEntries = NUM_WGS*(1<<BITS_PER_PASS);
	sortData->m_workBuffer0 = new Buffer<u32>( device, maxSize );
	sortData->m_workBuffer1 = new Buffer<u32>( device, nHistogramEntries );
	sortData->m_workBuffer2 = new Buffer<u32>( device, maxSize );
	sortData->m_workBuffer3 = new Buffer<SortData>( device, maxSize );

	// One small const buffer per 4-bit pass so consecutive passes do not
	// stomp on each other's in-flight launch constants.
	for(int pass=0; pass<32/BITS_PER_PASS; pass++)
		sortData->m_constBuffer[pass] = new Buffer<ConstData>( device, 1, BufferBase::BUFFER_CONST );

	sortData->m_copyData = Copy<TYPE>::allocate( device );
	return sortData;
}
// Releases every buffer and helper created by allocate(), then the Data
// object itself. Kernels are owned by the device and are not freed here.
template<DeviceType TYPE>
void RadixSort32<TYPE>::deallocate( Data* data )
{
	delete data->m_workBuffer0;
	delete data->m_workBuffer1;
	delete data->m_workBuffer2;
	delete data->m_workBuffer3;
	for(int i=0; i<32/BITS_PER_PASS; i++)
		delete data->m_constBuffer[i];
	Copy<TYPE>::deallocate( data->m_copyData );
	delete data;
}
// In-place LSD radix sort of inout[0..n). Runs one 4-bit pass per iteration:
// per-group histogram (stream count), global prefix scan of the histograms,
// then local sort + scatter, ping-ponging between inout and workBuffer0.
// n must be a multiple of DATA_ALIGNMENT and sortBits a multiple of 4.
template<DeviceType TYPE>
void RadixSort32<TYPE>::execute(Data* data, Buffer<u32>& inout, int n, int sortBits /* = 32 */ )
{
	ADLASSERT( n%DATA_ALIGNMENT == 0 );
	ADLASSERT( n <= data->m_maxSize );
//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
	ADLASSERT( BITS_PER_PASS == 4 );
	ADLASSERT( WG_SIZE == 64 );
	ADLASSERT( (sortBits&0x3) == 0 );
	Buffer<u32>* src = &inout;
	Buffer<u32>* dst = data->m_workBuffer0;
	Buffer<u32>* histogramBuffer = data->m_workBuffer1;
	int nWGs = NUM_WGS;
	ConstData cdata;
	{
		// Distribute the input blocks across the work groups; with few blocks,
		// launch only as many groups as there are blocks.
		int nBlocks = (n+ELEMENTS_PER_WORK_ITEM*WG_SIZE-1)/(ELEMENTS_PER_WORK_ITEM*WG_SIZE);
		cdata.m_n = n;
		cdata.m_nWGs = NUM_WGS;
		cdata.m_startBit = 0;
		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
		if( nBlocks < NUM_WGS )
		{
			cdata.m_nBlocksPerWG = 1;
			nWGs = nBlocks;
		}
	}
	for(int ib=0; ib<sortBits; ib+=4)
	{
		cdata.m_startBit = ib;
		{
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer ) };
			Launcher launcher( data->m_device, data->m_streamCountKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[ib/4], cdata );
			launcher.launch1D( NUM_WGS*WG_SIZE, WG_SIZE );
		}
		{// prefix scan group histogram
			BufferInfo bInfo[] = { BufferInfo( histogramBuffer ) };
			Launcher launcher( data->m_device, data->m_prefixScanKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[ib/4], cdata );
			launcher.launch1D( 128, 128 );
		}
		{// local sort and distribute
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer, true ), BufferInfo( dst ) };
			Launcher launcher( data->m_device, data->m_sortAndScatterKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[ib/4], cdata );
			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
		}
		swap2( src, dst );
	}
	// Odd number of passes leaves the result in the work buffer; copy it back.
	if( src != &inout )
	{
		Copy<TYPE>::execute( data->m_copyData, (Buffer<float>&)inout, (Buffer<float>&)*src, n );
	}
}
// Out-of-place variant: sorts `in` into `out`, leaving `in` as scratch after
// the first pass. The destination is retargeted to `out` on the second pass
// (or immediately for a single-pass sort) so the final pass lands in `out`.
// n must be a multiple of DATA_ALIGNMENT and sortBits a multiple of 4.
template<DeviceType TYPE>
void RadixSort32<TYPE>::execute(Data* data, Buffer<u32>& in, Buffer<u32>& out, int n, int sortBits /* = 32 */ )
{
	ADLASSERT( n%DATA_ALIGNMENT == 0 );
	ADLASSERT( n <= data->m_maxSize );
//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
	ADLASSERT( BITS_PER_PASS == 4 );
	ADLASSERT( WG_SIZE == 64 );
	ADLASSERT( (sortBits&0x3) == 0 );
	Buffer<u32>* src = &in;
	Buffer<u32>* dst = data->m_workBuffer0;
	Buffer<u32>* histogramBuffer = data->m_workBuffer1;
	int nWGs = NUM_WGS;
	ConstData cdata;
	{
		// Block distribution, same scheme as the in-place overload.
		int nBlocks = (n+ELEMENTS_PER_WORK_ITEM*WG_SIZE-1)/(ELEMENTS_PER_WORK_ITEM*WG_SIZE);
		cdata.m_n = n;
		cdata.m_nWGs = NUM_WGS;
		cdata.m_startBit = 0;
		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
		if( nBlocks < NUM_WGS )
		{
			cdata.m_nBlocksPerWG = 1;
			nWGs = nBlocks;
		}
	}
	if( sortBits == 4 ) dst = &out;
	for(int ib=0; ib<sortBits; ib+=4)
	{
		// From the second pass on, ping-pong between `out` and `in`.
		if( ib==4 )
		{
			dst = &out;
		}
		cdata.m_startBit = ib;
		{
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer ) };
			Launcher launcher( data->m_device, data->m_streamCountKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[ib/4], cdata );
			launcher.launch1D( NUM_WGS*WG_SIZE, WG_SIZE );
		}
		{// prefix scan group histogram
			BufferInfo bInfo[] = { BufferInfo( histogramBuffer ) };
			Launcher launcher( data->m_device, data->m_prefixScanKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[ib/4], cdata );
			launcher.launch1D( 128, 128 );
		}
		{// local sort and distribute
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer, true ), BufferInfo( dst ) };
			Launcher launcher( data->m_device, data->m_sortAndScatterKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[ib/4], cdata );
			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
		}
		swap2( src, dst );
	}
}
// Key/value variant: sorts keysIn into keysOut while moving valuesIn into
// valuesOut with the same permutation. Uses workBuffer0/workBuffer2 as the
// key/value scratch pair and retargets the destinations to the user output
// buffers from the second pass on (or immediately for a single-pass sort).
template<DeviceType TYPE>
void RadixSort32<TYPE>::execute(Data* data, Buffer<u32>& keysIn, Buffer<u32>& keysOut, Buffer<u32>& valuesIn, Buffer<u32>& valuesOut, int n, int sortBits /* = 32 */)
{
	ADLASSERT( n%DATA_ALIGNMENT == 0 );
	ADLASSERT( n <= data->m_maxSize );
//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
	ADLASSERT( BITS_PER_PASS == 4 );
	ADLASSERT( WG_SIZE == 64 );
	ADLASSERT( (sortBits&0x3) == 0 );
	Buffer<u32>* src = &keysIn;
	Buffer<u32>* srcVal = &valuesIn;
	Buffer<u32>* dst = data->m_workBuffer0;
	Buffer<u32>* dstVal = data->m_workBuffer2;
	Buffer<u32>* histogramBuffer = data->m_workBuffer1;
	int nWGs = NUM_WGS;
	ConstData cdata;
	{
		// Block distribution, same scheme as the in-place overload.
		int nBlocks = (n+ELEMENTS_PER_WORK_ITEM*WG_SIZE-1)/(ELEMENTS_PER_WORK_ITEM*WG_SIZE);
		cdata.m_n = n;
		cdata.m_nWGs = NUM_WGS;
		cdata.m_startBit = 0;
		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
		if( nBlocks < NUM_WGS )
		{
			cdata.m_nBlocksPerWG = 1;
			nWGs = nBlocks;
		}
	}
	if( sortBits == 4 )
	{
		dst = &keysOut;
		dstVal = &valuesOut;
	}
	for(int ib=0; ib<sortBits; ib+=4)
	{
		// From the second pass on, ping-pong against the user output buffers.
		if( ib==4 )
		{
			dst = &keysOut;
			dstVal = &valuesOut;
		}
		cdata.m_startBit = ib;
		{
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer ) };
			Launcher launcher( data->m_device, data->m_streamCountKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[ib/4], cdata );
			launcher.launch1D( NUM_WGS*WG_SIZE, WG_SIZE );
		}
		{// prefix scan group histogram
			BufferInfo bInfo[] = { BufferInfo( histogramBuffer ) };
			Launcher launcher( data->m_device, data->m_prefixScanKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[ib/4], cdata );
			launcher.launch1D( 128, 128 );
		}
		{// local sort and distribute
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( srcVal, true ), BufferInfo( histogramBuffer, true ), BufferInfo( dst ), BufferInfo( dstVal ) };
			Launcher launcher( data->m_device, data->m_sortAndScatterKeyValueKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[ib/4], cdata );
			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
		}
		swap2( src, dst );
		swap2( srcVal, dstVal );
	}
}
// Packed key/value (SortData) in-place variant, ping-ponging against
// workBuffer3. NOTE: an odd number of 4-bit passes (sortBits = 4, 12, 20, 28)
// would leave the result in the work buffer; that copy-back is not
// implemented yet, hence the assert at the bottom.
template<DeviceType TYPE>
void RadixSort32<TYPE>::execute(Data* data, Buffer<SortData>& keyValuesInOut, int n, int sortBits /* = 32 */)
{
	ADLASSERT( n%DATA_ALIGNMENT == 0 );
	ADLASSERT( n <= data->m_maxSize );
//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
	ADLASSERT( BITS_PER_PASS == 4 );
	ADLASSERT( WG_SIZE == 64 );
	ADLASSERT( (sortBits&0x3) == 0 );
	Buffer<SortData>* src = &keyValuesInOut;
	Buffer<SortData>* dst = data->m_workBuffer3;
	Buffer<u32>* histogramBuffer = data->m_workBuffer1;
	int nWGs = NUM_WGS;
	ConstData cdata;
	{
		// Block distribution, same scheme as the in-place overload.
		int nBlocks = (n+ELEMENTS_PER_WORK_ITEM*WG_SIZE-1)/(ELEMENTS_PER_WORK_ITEM*WG_SIZE);
		cdata.m_n = n;
		cdata.m_nWGs = NUM_WGS;
		cdata.m_startBit = 0;
		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
		if( nBlocks < NUM_WGS )
		{
			cdata.m_nBlocksPerWG = 1;
			nWGs = nBlocks;
		}
	}
	int count=0;
	for(int ib=0; ib<sortBits; ib+=4)
	{
		cdata.m_startBit = ib;
		{
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer ) };
			Launcher launcher( data->m_device, data->m_streamCountSortDataKernel);
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[ib/4], cdata );
			launcher.launch1D( NUM_WGS*WG_SIZE, WG_SIZE );
		}
		{// prefix scan group histogram
			BufferInfo bInfo[] = { BufferInfo( histogramBuffer ) };
			Launcher launcher( data->m_device, data->m_prefixScanKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[ib/4], cdata );
			launcher.launch1D( 128, 128 );
		}
		{// local sort and distribute
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( histogramBuffer, true ), BufferInfo( dst )};
			Launcher launcher( data->m_device, data->m_sortAndScatterSortDataKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[ib/4], cdata );
			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
		}
		swap2( src, dst );
		count++;
	}
	if (count&1)
	{
		ADLASSERT(0);//need to copy from workbuffer to keyValuesInOut
	}
}
// Undo every macro defined at the top of this .inl so they do not leak into
// translation units that include it. KERNEL4/KERNEL5 were previously left
// defined, which could silently collide with later includes.
#undef PATH
#undef RADIXSORT32_KERNEL0
#undef RADIXSORT32_KERNEL1
#undef RADIXSORT32_KERNEL2
#undef RADIXSORT32_KERNEL3
#undef RADIXSORT32_KERNEL4
#undef RADIXSORT32_KERNEL5

View File

@@ -0,0 +1,163 @@
/*
2011 Takahiro Harada
*/
// CPU reference implementation of the 32-bit radix sort: a classic LSD
// counting sort over 8-bit digits (4 passes for 32 bits), ping-ponging
// between the caller's buffer and a scratch work buffer.
template<>
class RadixSort32<TYPE_HOST> : public RadixSort32Base
{
	public:
		typedef Launcher::BufferInfo BufferInfo;
		enum
		{
			BITS_PER_PASS = 8,
			NUM_TABLES = (1<<BITS_PER_PASS),
		};
		// Host-side state: just the scratch buffer used for ping-ponging.
		struct Data
		{
			HostBuffer<u32>* m_workBuffer;
		};
		// Allocates scratch for up to maxSize elements; device must be host.
		static
		Data* allocate(const Device* device, int maxSize)
		{
			ADLASSERT( device->m_type == TYPE_HOST );
			Data* data = new Data;
			data->m_workBuffer = new HostBuffer<u32>( device, maxSize );
			return data;
		}
		static
		void deallocate(Data* data)
		{
			delete data->m_workBuffer;
			delete data;
		}
		// Sorts inout[0..n) in place by the low sortBits bits.
		static
		void execute(Data* data, Buffer<u32>& inout, int n, int sortBits = 32)
		{
			ADLASSERT( inout.getType() == TYPE_HOST );
			int tables[NUM_TABLES];
			int counter[NUM_TABLES];
			u32* src = inout.m_ptr;
			u32* dst = data->m_workBuffer->m_ptr;
			for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
			{
				// Histogram the current 8-bit digit.
				for(int i=0; i<NUM_TABLES; i++)
				{
					tables[i] = 0;
				}
				for(int i=0; i<n; i++)
				{
					int tableIdx = (src[i] >> startBit) & (NUM_TABLES-1);
					tables[tableIdx]++;
				}
				// prefix scan
				int sum = 0;
				for(int i=0; i<NUM_TABLES; i++)
				{
					int iData = tables[i];
					tables[i] = sum;
					sum += iData;
					counter[i] = 0;
				}
				// distribute
				for(int i=0; i<n; i++)
				{
					int tableIdx = (src[i] >> startBit) & (NUM_TABLES-1);
					dst[tables[tableIdx] + counter[tableIdx]] = src[i];
					counter[tableIdx] ++;
				}
				swap2( src, dst );
			}
			{
				// After an odd number of passes the result sits in the scratch
				// buffer (src); dst then aliases inout.m_ptr, so this copies back.
				if( src != inout.m_ptr )
				{
					memcpy( dst, src, sizeof(u32)*n );
				}
			}
		}
		// Key/value variant: sorts keyInout in place and applies the same
		// permutation to valueInout. NOTE(review): valueInout is declared const
		// but its storage is written through m_ptr — confirm intent.
		static
		void execute(Data* data, Buffer<u32>& keyInout, const Buffer<u32>& valueInout, int n, int sortBits = 32)
		{
			ADLASSERT( keyInout.getType() == TYPE_HOST );
			int tables[NUM_TABLES];
			int counter[NUM_TABLES];
			u32* src = keyInout.m_ptr;
			u32* dst = data->m_workBuffer->m_ptr;
			// Scratch copy of the values for ping-ponging.
			HostBuffer<u32> bufVal(valueInout.m_device, valueInout.m_size);
			bufVal.write(valueInout.m_ptr, valueInout.m_size);
			u32* srcVal = valueInout.m_ptr;
			u32* dstVal = bufVal.m_ptr;
			for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
			{
				// Histogram the current 8-bit digit.
				for(int i=0; i<NUM_TABLES; i++)
				{
					tables[i] = 0;
				}
				for(int i=0; i<n; i++)
				{
					int tableIdx = (src[i] >> startBit) & (NUM_TABLES-1);
					tables[tableIdx]++;
				}
				// prefix scan
				int sum = 0;
				for(int i=0; i<NUM_TABLES; i++)
				{
					int iData = tables[i];
					tables[i] = sum;
					sum += iData;
					counter[i] = 0;
				}
				// distribute
				for(int i=0; i<n; i++)
				{
					int tableIdx = (src[i] >> startBit) & (NUM_TABLES-1);
					int newIdx = tables[tableIdx] + counter[tableIdx];
					dst[newIdx] = src[i];
					dstVal[newIdx] = srcVal[i];
					counter[tableIdx]++;
				}
				swap2( src, dst );
				swap2( srcVal, dstVal );
			}
			{
				// Copy back from scratch after an odd number of passes; the dst
				// pointers then alias the caller's buffers.
				if( src != keyInout.m_ptr )
				{
					memcpy( dst, src, sizeof(u32)*n );
				}
				if( srcVal != valueInout.m_ptr )
				{
					memcpy( dstVal, srcVal, sizeof(u32)*n );
				}
			}
		}
};

View File

@@ -0,0 +1,985 @@
/*
2011 Takahiro Harada
*/
// Portability shim mapping OpenCL-flavored names onto HLSL so the kernel
// bodies can be shared between backends.
typedef uint u32;
#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
#define AtomInc(x) InterlockedAdd(x, 1)
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
#define min2 min
#define max2 max
// Per-pass launch constants (matches ConstData on the host side).
cbuffer CB0 : register( b0 )
{
	int m_startBit;
	int m_totalBlocks;
	int m_nWorkGroupsToExecute;
	int m_nBlocksPerGroup;
};
typedef struct {
	unsigned int key;
	unsigned int value;
} KeyValuePair;
// rHistogram: globally prefix-scanned per-(bucket, group) histogram.
StructuredBuffer<u32> rHistogram : register(t0);
RWStructuredBuffer<KeyValuePair> dataToSort : register( u0 );
RWStructuredBuffer<KeyValuePair> dataToSortOut : register( u1 );
#define WG_SIZE 128
#define ELEMENTS_PER_WORK_ITEM 4
#define BITS_PER_PASS 4
#define NUM_BUCKET (1<<BITS_PER_PASS)
// Group-shared scratch. sorterSharedMemory is sized for both the double-sized
// scan area and the packed key/value staging area, whichever is larger.
groupshared u32 sorterSharedMemory[max(WG_SIZE*2*2, WG_SIZE*ELEMENTS_PER_WORK_ITEM*2)];
groupshared u32 localHistogramToCarry[NUM_BUCKET];
groupshared u32 localHistogram[NUM_BUCKET*2];
groupshared u32 localHistogramMat[NUM_BUCKET*WG_SIZE];
groupshared u32 localPrefixSum[NUM_BUCKET];
// Store/load a KeyValuePair to/from the interleaved staging area.
#define SET_LOCAL_SORT_DATA(idx, sortDataIn) sorterSharedMemory[2*(idx)+0] = sortDataIn.key;	sorterSharedMemory[2*(idx)+1] = sortDataIn.value;
#define GET_LOCAL_SORT_DATA(idx, sortDataOut) sortDataOut.key = sorterSharedMemory[2*(idx)+0];	sortDataOut.value = sorterSharedMemory[2*(idx)+1];
// Inclusive prefix sum across the four lanes of a uint4:
// returns (x, x+y, x+y+z, x+y+z+w).
uint4 prefixScanVector( uint4 v )
{
	uint4 scan;
	scan.x = v.x;
	scan.y = v.x + v.y;
	scan.z = scan.y + v.z;
	scan.w = scan.z + v.w;
	return scan;
}
// Exclusive prefix sum across the four lanes of a uint4: data becomes
// (0, x, x+y, x+y+z) and the total x+y+z+w is returned.
uint prefixScanVectorEx( inout uint4 data )
{
	uint a = data.x;
	uint b = data.y;
	uint c = data.z;
	uint d = data.w;
	data.x = 0;
	data.y = a;
	data.z = a + b;
	data.w = a + b + c;
	return a + b + c + d;
}
// Work-group-wide (128-lane) prefix scan over one value per thread, using the
// shared scan area: the lower half is zero-padded so out-of-range reads during
// the log-step accumulation are harmless. Returns this thread's exclusive
// prefix and writes the group total to totalSum.
// NOTE(review): the accumulation steps between barriers appear to rely on
// 64-wide SIMD lockstep (only lIdx < 64 participate) — confirm on narrower
// hardware.
uint localPrefixScan128( uint pData, uint lIdx, inout uint totalSum )
{
	{	// Set data
		sorterSharedMemory[lIdx] = 0;
		sorterSharedMemory[lIdx+WG_SIZE] = pData;
	}
	GROUP_LDS_BARRIER;
	{	// Prefix sum
		int idx = 2*lIdx + (WG_SIZE+1);
		if( lIdx < 64 )
		{
			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
		}
		// Fill in the odd (non-accumulated) slots from their neighbors.
		if( lIdx < 64 ) sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
	}
	GROUP_LDS_BARRIER;
	totalSum = sorterSharedMemory[WG_SIZE*2-1];
	return sorterSharedMemory[lIdx+127];
}
// Two independent 128-lane prefix scans at once (over pData0 and pData1),
// using two halves of the shared scan area. All 128 lanes participate: the
// lower 64 handle stream 0, the upper 64 handle stream 1. Outputs each
// thread's exclusive prefix (rank0/rank1) and each stream's total.
void localPrefixScan128Dual( uint pData0, uint pData1, uint lIdx,
							inout uint rank0, inout uint rank1,
							inout uint totalSum0, inout uint totalSum1 )
{
	{	// Set data
		sorterSharedMemory[lIdx] = 0;
		sorterSharedMemory[lIdx+WG_SIZE] = pData0;
		sorterSharedMemory[2*WG_SIZE+lIdx] = 0;
		sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1;
	}
	GROUP_LDS_BARRIER;
//	if( lIdx < 128 )	//	todo. assert wg size is 128
	{	// Prefix sum
		// blockIdx selects which of the two scan areas this lane works in.
		int blockIdx = lIdx/64;
		int groupIdx = lIdx%64;
		int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;
		sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
		sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
	}
	GROUP_LDS_BARRIER;
	totalSum0 = sorterSharedMemory[WG_SIZE*2-1];
	rank0 = sorterSharedMemory[lIdx+127];
	totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];
	rank1 = sorterSharedMemory[2*WG_SIZE+lIdx+127];
}
// Group-wide exclusive prefix sum over 4 elements per thread (512 total):
// first an exclusive scan within each thread's uint4, then a 128-lane scan of
// the per-thread sums whose result is added back to every lane. Writes the
// group total to totalSum.
uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum )
{
	{	// Set data
		sorterSharedMemory[lIdx] = 0;
		sorterSharedMemory[lIdx+WG_SIZE] = prefixScanVectorEx( pData );
	}
	GROUP_LDS_BARRIER;
	{	// Prefix sum
		int idx = 2*lIdx + (WG_SIZE+1);
		if( lIdx < 64 )
		{
			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
			sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
		}
	}
	GROUP_LDS_BARRIER;
	totalSum = sorterSharedMemory[WG_SIZE*2-1];
	uint addValue = sorterSharedMemory[lIdx+127];
	return pData + uint4(addValue, addValue, addValue, addValue);
}
// Dual version of localPrefixSum128V: two independent exclusive prefix sums
// over 4-elements-per-thread streams, sharing one pass through LDS. The
// commented reference implementation at the top shows the equivalent
// two-call formulation this fuses.
void localPrefixSum128Dual( uint4 pData0, uint4 pData1, uint lIdx,
						   inout uint4 dataOut0, inout uint4 dataOut1,
						   inout uint totalSum0, inout uint totalSum1 )
{
/*
	dataOut0 = localPrefixSum128V( pData0, lIdx, totalSum0 );
	GROUP_LDS_BARRIER;
	dataOut1 = localPrefixSum128V( pData1, lIdx, totalSum1 );
	return;
*/
	// Keep originals: the in-vector scan below is inclusive, and the backup is
	// subtracted at the end to make the result exclusive.
	uint4 backup0 = pData0;
	uint4 backup1 = pData1;
	{	// Prefix sum in a vector
		pData0 = prefixScanVector( pData0 );
		pData1 = prefixScanVector( pData1 );
	}
	{	// Set data
		sorterSharedMemory[lIdx] = 0;
		sorterSharedMemory[lIdx+WG_SIZE] = pData0.w;
		sorterSharedMemory[2*WG_SIZE+lIdx] = 0;
		sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1.w;
	}
	GROUP_LDS_BARRIER;
//	if( lIdx < 128 )	//	todo. assert wg size is 128
	{	// Prefix sum
		// Lanes 0..63 scan stream 0's area, lanes 64..127 scan stream 1's.
		int blockIdx = lIdx/64;
		int groupIdx = lIdx%64;
		int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;
		sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
		sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
		sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
	}
	GROUP_LDS_BARRIER;
	totalSum0 = sorterSharedMemory[WG_SIZE*2-1];
	{
		uint addValue = sorterSharedMemory[lIdx+127];
		dataOut0 = pData0 + uint4(addValue, addValue, addValue, addValue) - backup0;
	}
	totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];
	{
		uint addValue = sorterSharedMemory[2*WG_SIZE+lIdx+127];
		dataOut1 = pData1 + uint4(addValue, addValue, addValue, addValue) - backup1;
	}
}
// Per-lane equality mask: 1 in each lane of data that equals targetKey.
uint4 extractKeys(uint4 data, uint targetKey)
{
	return uint4( data.x == targetKey ? 1 : 0,
				  data.y == targetKey ? 1 : 0,
				  data.z == targetKey ? 1 : 0,
				  data.w == targetKey ? 1 : 0 );
}
// Per-lane bit test: 1 in each lane of data whose bit `targetKey` is set.
uint4 extractKeysByBits(uint4 data, uint targetKey)
{
	uint4 bits;
	bits.x = (data.x >> targetKey) & 1;
	bits.y = (data.y >> targetKey) & 1;
	bits.z = (data.z >> targetKey) & 1;
	bits.w = (data.w >> targetKey) & 1;
	return bits;
}
// Packs two 16-bit counters into one 32-bit word (lower in bits 0..15,
// upper in bits 16..31).
uint packKeys(uint lower, uint upper)
{
	return (upper<<16) | lower;
}
// Component-wise pack of two 16-bit counter vectors into one uint4.
uint4 packKeys(uint4 lower, uint4 upper)
{
	return lower | (upper << 16);
}
// Low 16 bits of a packed counter pair.
uint extractLower( uint data )
{
	return data & 0x0000ffff;
}
// High 16 bits of a packed counter pair (uint shift is logical, so the
// result is already masked).
uint extractUpper( uint data )
{
	return data >> 16;
}
// Component-wise low 16 bits of packed counter pairs.
uint4 extractLower( uint4 data )
{
	return data & 0xffff;
}
// Component-wise high 16 bits of packed counter pairs (logical shift on
// uint4 already zeroes the upper bits).
uint4 extractUpper( uint4 data )
{
	return data >> 16;
}
// One sort-and-scatter pass of the 4-bit radix sort (NUM_BUCKET == 16).
// For each block of WG_SIZE*ELEMENTS_PER_WORK_ITEM pairs owned by this workgroup:
//  1. rank every element within its bucket (four buckets processed at a time,
//     two buckets packed 16:16 into one scan word),
//  2. scatter the pairs into shared memory in locally sorted order,
//  3. write each pair to its global slot using the per-workgroup bucket bases
//     read from rHistogram, then advance those bases past this block.
[numthreads(WG_SIZE, 1, 1)]
void SortAndScatterKernel( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;

	// Global scatter base of each bucket for this workgroup.
	if( lIdx < (NUM_BUCKET) )
	{
		localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx];
	}

	GROUP_LDS_BARRIER;

	for(uint igroup=wgIdx*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx+1)*m_nBlocksPerGroup); igroup++)
	{
		u32 myHistogram;	// count of bucket 'lIdx' in this block; only meaningful for lIdx < NUM_BUCKET
		if( lIdx < (NUM_BUCKET) )
		{
			// Fix: integer literal; localPrefixSum is a u32 array (was '0.f').
			localPrefixSum[lIdx] = 0;
		}

		u32 newOffset[4];	// local (in-block) destination slot of each of my 4 pairs
		KeyValuePair myData[4];
		{ // read data
			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
			uint startAddress = igroup*numLocalElements + lIdx*4;

			myData[0] = dataToSort[startAddress+0];
			myData[1] = dataToSort[startAddress+1];
			myData[2] = dataToSort[startAddress+2];
			myData[3] = dataToSort[startAddress+3];

			newOffset[0] = newOffset[1] = newOffset[2] = newOffset[3] = 0;
		}

		int localOffset = 0;	// first local slot of the next 4-bucket group
		// 4-bit digit of each of my 4 keys for this pass.
		uint4 b = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);
		// Process the 16 buckets four at a time.
		for(uint targetKey=0; targetKey<(NUM_BUCKET); targetKey+=4)
		{
			uint4 key[4];	// per-bucket match masks for my 4 elements
			uint keySet[2];	// per-thread match counts, two buckets packed 16:16 per word
			{ // pack 4
				uint4 scannedKey[4];
				key[0] = scannedKey[0] = extractKeys( b, targetKey+0 );
				key[1] = scannedKey[1] = extractKeys( b, targetKey+1 );
				key[2] = scannedKey[2] = extractKeys( b, targetKey+2 );
				key[3] = scannedKey[3] = extractKeys( b, targetKey+3 );
				{
					uint s[4];
					s[0] = prefixScanVectorEx( scannedKey[0] );
					s[1] = prefixScanVectorEx( scannedKey[1] );
					s[2] = prefixScanVectorEx( scannedKey[2] );
					s[3] = prefixScanVectorEx( scannedKey[3] );
					keySet[0] = packKeys( s[0], s[1] );
					keySet[1] = packKeys( s[2], s[3] );
				}
			}

			uint dstAddressBase[4];	// rank of my first matching element within each bucket
			{
				uint totalSumPacked[2];
				uint dstAddressPacked[2];

				// One dual scan ranks all four buckets (two 16:16 packed scans).
				localPrefixScan128Dual( keySet[0], keySet[1], lIdx, dstAddressPacked[0], dstAddressPacked[1], totalSumPacked[0], totalSumPacked[1] );

				dstAddressBase[0] = extractLower( dstAddressPacked[0] );
				dstAddressBase[1] = extractUpper( dstAddressPacked[0] );
				dstAddressBase[2] = extractLower( dstAddressPacked[1] );
				dstAddressBase[3] = extractUpper( dstAddressPacked[1] );

				uint4 histogram;	// block-wide counts of the four buckets
				histogram.x = extractLower(totalSumPacked[0]);
				histogram.y = extractUpper(totalSumPacked[0]);
				histogram.z = extractLower(totalSumPacked[1]);
				histogram.w = extractUpper(totalSumPacked[1]);

				// Thread 'bucket' keeps its bucket's count for the carry update below.
				if( lIdx == targetKey + 0 ) myHistogram = histogram.x;
				else if( lIdx == targetKey + 1 ) myHistogram = histogram.y;
				else if( lIdx == targetKey + 2 ) myHistogram = histogram.z;
				else if( lIdx == targetKey + 3 ) myHistogram = histogram.w;

				uint histogramSum = prefixScanVectorEx( histogram );

				// Record the first local slot of each of the four buckets.
				if( lIdx == targetKey + 0 ) localPrefixSum[targetKey+0] = localOffset+histogram.x;
				else if( lIdx == targetKey + 1 ) localPrefixSum[targetKey+1] = localOffset+histogram.y;
				else if( lIdx == targetKey + 2 ) localPrefixSum[targetKey+2] = localOffset+histogram.z;
				else if( lIdx == targetKey + 3 ) localPrefixSum[targetKey+3] = localOffset+histogram.w;

				localOffset += histogramSum;
			}

			GROUP_LDS_BARRIER;

			// Accumulate each element's local destination. Exactly one bucket
			// matches per element, so exactly one term of the sum is non-zero.
			for(int ie=0; ie<4; ie++)
			{
				uint4 scannedKey = key[ie];
				prefixScanVectorEx( scannedKey );

				uint offset = localPrefixSum[targetKey + ie] + dstAddressBase[ie];
				uint4 dstAddress = uint4( offset, offset, offset, offset ) + scannedKey;

				newOffset[0] += dstAddress.x*key[ie].x;
				newOffset[1] += dstAddress.y*key[ie].y;
				newOffset[2] += dstAddress.z*key[ie].z;
				newOffset[3] += dstAddress.w*key[ie].w;
			}
		}

		{ // local scatter
			SET_LOCAL_SORT_DATA(newOffset[0], myData[0]);
			SET_LOCAL_SORT_DATA(newOffset[1], myData[1]);
			SET_LOCAL_SORT_DATA(newOffset[2], myData[2]);
			SET_LOCAL_SORT_DATA(newOffset[3], myData[3]);
		}

		GROUP_LDS_BARRIER;

		{ // write data
			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
			{
				int dataIdx = 4*lIdx+i;
				KeyValuePair localData; GET_LOCAL_SORT_DATA( dataIdx, localData );
				int binIdx = (localData.key >> m_startBit) & 0xf;
				int groupOffset = localHistogramToCarry[binIdx];
				// Position within the bucket = local rank minus the bucket's first local slot.
				int myIdx = dataIdx - localPrefixSum[binIdx];

				dataToSortOut[ groupOffset + myIdx ] = localData;
			}
		}

		GROUP_LDS_BARRIER;
		// Advance the global bucket bases past this block's elements.
		if( lIdx < NUM_BUCKET )
		{
			localHistogramToCarry[lIdx] += myHistogram;
		}
		GROUP_LDS_BARRIER;
	}
}
// Alternative sort-and-scatter pass: instead of ranking all 16 buckets at once,
// each block is sorted in shared memory by BITS_PER_PASS stable 1-bit split
// steps, after which the bucket histogram is rebuilt with atomics and the
// sorted pairs are scattered to their global positions.
[numthreads(WG_SIZE, 1, 1)]
void SortAndScatterKernel1( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;

	// Global scatter base of each bucket for this workgroup.
	if( lIdx < (NUM_BUCKET) )
	{
		localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx.x];
	}

	GROUP_LDS_BARRIER;

	for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)
	{
		u32 myHistogram;	// count of bucket 'lIdx' in this block; only meaningful for lIdx < NUM_BUCKET

		KeyValuePair myData[4];
		uint startAddrBlock;	// this thread's first element index within the block (lIdx*4)
		{ // read data
			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
			startAddrBlock = lIdx*4;
			uint startAddress = igroup*numLocalElements + startAddrBlock;

			myData[0] = dataToSort[startAddress+0];
			myData[1] = dataToSort[startAddress+1];
			myData[2] = dataToSort[startAddress+2];
			myData[3] = dataToSort[startAddress+3];
		}

		// local sort: one stable split per bit, low bit first, leaving the block
		// sorted by the full 4-bit digit of this pass
		for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)
		{
			// keys==1 for elements whose bit 'ib' is 0 (they move to the front)
			uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);
			uint total;	// number of zero-bit elements in the whole block
			uint4 rankOfP = localPrefixSum128V( keys, lIdx, total );	// rank among zero-bit elements
			// one-bit elements are appended after the 'total' zero-bit ones,
			// keeping their relative order (stable split)
			uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );

			// component-wise select of each element's destination slot
			uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;

			GROUP_LDS_BARRIER;

			SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );
			SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );
			SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );
			SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );

			GROUP_LDS_BARRIER;

			// read back in block order for the next bit
			GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );
			GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );
			GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );
			GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );
		}

		{// create histogram -> prefix sum
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[lIdx] = 0;
				localHistogram[NUM_BUCKET+lIdx] = 0;
			}
			GROUP_LDS_BARRIER;

			// count the 4-bit digits of this thread's (now locally sorted) elements
			uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);

			InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );
			InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );
			InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );
			InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );

			GROUP_LDS_BARRIER;

			uint hIdx = NUM_BUCKET+lIdx;
			if( lIdx < NUM_BUCKET )
			{
				// keep the raw bucket count for the carry update at the end
				myHistogram = localHistogram[hIdx];
			}
			GROUP_LDS_BARRIER;

			// Turn counts into per-bucket start offsets: shift by one slot, then
			// strided adds (1,2,4,8) across the 16 buckets.
			// NOTE(review): no barriers between these adds - this appears to rely
			// on the first NUM_BUCKET threads executing in lockstep (a single
			// wavefront/warp); confirm for the target hardware.
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[hIdx] = localHistogram[hIdx-1];
				localHistogram[hIdx] += localHistogram[hIdx-1];
				localHistogram[hIdx] += localHistogram[hIdx-2];
				localHistogram[hIdx] += localHistogram[hIdx-4];
				localHistogram[hIdx] += localHistogram[hIdx-8];
			}
			GROUP_LDS_BARRIER;
		}
		/*
		{// write back
			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
			startAddrBlock = lIdx*4;
			uint startAddress = igroup*numLocalElements + startAddrBlock;
			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
			{
				dataToSortOut[ startAddress+ie ] = myData[ie];
			}
		}
		*/
		{
			// scatter: global slot = bucket base + (local index - bucket's local start)
			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
			{
				int dataIdx = startAddrBlock+ie;
				int binIdx = (myData[ie].key>>m_startBit)&0xf;
				int groupOffset = localHistogramToCarry[binIdx];
				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];

				dataToSortOut[ groupOffset + myIdx ] = myData[ie];
			}
		}

		GROUP_LDS_BARRIER;
		// advance the global bucket bases past this block's elements
		if( lIdx < NUM_BUCKET )
		{
			localHistogramToCarry[lIdx] += myHistogram;
		}
		GROUP_LDS_BARRIER;
	}
}
/*
[numthreads(WG_SIZE, 1, 1)]
void SortAndScatterKernel1( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )
{
if( lIdx.x < (NUM_BUCKET) )
{
localHistogramToCarry[lIdx.x] = rHistogram[lIdx.x*m_nWorkGroupsToExecute + gIdx.x];
}
GROUP_LDS_BARRIER;
for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)
{
u32 myHistogram;
KeyValuePair myData[4];
uint startAddrBlock;
{ // read data
int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
startAddrBlock = lIdx.x*4;
uint startAddress = igroup*numLocalElements + startAddrBlock;
myData[0] = dataToSort[startAddress+0];
myData[1] = dataToSort[startAddress+1];
myData[2] = dataToSort[startAddress+2];
myData[3] = dataToSort[startAddress+3];
}
for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)
{
uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);
uint total;
uint4 rankOfP = localPrefixSum128V( keys, lIdx.x, total );
uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );
uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;
GROUP_LDS_BARRIER;
SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );
SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );
SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );
SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );
GROUP_LDS_BARRIER;
GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );
GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );
GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );
GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );
}
{// create histogram -> prefix sum
if( lIdx.x < NUM_BUCKET )
{
localHistogram[lIdx.x] = 0;
localHistogram[NUM_BUCKET+lIdx.x] = 0;
}
GROUP_LDS_BARRIER;
uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);
InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );
InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );
InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );
InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );
GROUP_LDS_BARRIER;
uint hIdx = NUM_BUCKET+lIdx.x;
if( lIdx.x < NUM_BUCKET )
{
myHistogram = localHistogram[hIdx];
}
GROUP_LDS_BARRIER;
if( lIdx.x < NUM_BUCKET )
{
localHistogram[hIdx] = localHistogram[hIdx-1];
localHistogram[hIdx] += localHistogram[hIdx-1];
localHistogram[hIdx] += localHistogram[hIdx-2];
localHistogram[hIdx] += localHistogram[hIdx-4];
localHistogram[hIdx] += localHistogram[hIdx-8];
}
GROUP_LDS_BARRIER;
}
{// write back
for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
{
int dataIdx = startAddrBlock+ie;
int binIdx = (myData[ie].key>>m_startBit)&0xf;
int groupOffset = localHistogramToCarry[binIdx];
int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];
dataToSortOut[ groupOffset + myIdx ] = myData[ie];
}
}
GROUP_LDS_BARRIER;
if( lIdx.x < NUM_BUCKET )
{
localHistogramToCarry[lIdx.x] += myHistogram;
}
GROUP_LDS_BARRIER;
}
}
*/
// Read-only view of the sort input used by the histogram (stream-count) pass.
StructuredBuffer<KeyValuePair> dataToSort1 : register( t0 );
// Histogram output: one counter per (bucket, workgroup), laid out as
// NUM_BUCKET rows of m_nWorkGroupsToExecute entries.
RWStructuredBuffer<u32> wHistogram1 : register(u0);
// Per-thread counter for bucket 'idx', kept in shared memory: one row per
// bucket, one column per thread of the workgroup.
#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx.x]
// Histogram (stream-count) pass: for the blocks owned by this workgroup, count
// how many keys fall into each of the 16 buckets of the current 4-bit digit,
// then write one NUM_BUCKET-entry histogram column per workgroup to wHistogram1.
[numthreads(WG_SIZE, 1, 1)]
void StreamCountKernel( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;

	// Per-thread counters live in shared memory via MY_HISTOGRAM (one column
	// per thread); clear this thread's column. The unused register array
	// 'int myHistogram[NUM_BUCKET]' left over from the register-based variant
	// (see the commented-out kernel below) has been removed.
	for(int i=0; i<NUM_BUCKET; i++)
	{
		MY_HISTOGRAM(i) = 0;
	}

	for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)
	{
		uint localKeys[4];
		{ // read data
			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
			uint4 localAddress = uint4(lIdx, lIdx, lIdx, lIdx)*4+uint4(0,1,2,3);
			uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;

			KeyValuePair localData0 = dataToSort1[globalAddress.x];
			KeyValuePair localData1 = dataToSort1[globalAddress.y];
			KeyValuePair localData2 = dataToSort1[globalAddress.z];
			KeyValuePair localData3 = dataToSort1[globalAddress.w];

			// 4-bit digit of each key for this pass.
			localKeys[0] = (localData0.key >> m_startBit) & 0xf;
			localKeys[1] = (localData1.key >> m_startBit) & 0xf;
			localKeys[2] = (localData2.key >> m_startBit) & 0xf;
			localKeys[3] = (localData3.key >> m_startBit) & 0xf;
		}

		MY_HISTOGRAM( localKeys[0] )++;
		MY_HISTOGRAM( localKeys[1] )++;
		MY_HISTOGRAM( localKeys[2] )++;
		MY_HISTOGRAM( localKeys[3] )++;
	}

	GROUP_LDS_BARRIER;

	{ // reduce to 1
		// Tree-reduce the WG_SIZE per-thread counters of each bucket into
		// column 0: threads 0..63 reduce buckets 0..7, threads 64..127 reduce
		// buckets 8..15.
		// NOTE(review): the strided adds have no barriers between them and so
		// appear to rely on lockstep execution of the participating threads -
		// confirm for the target hardware.
		if( lIdx < 64 )//WG_SIZE/2 )
		{
			for(int i=0; i<NUM_BUCKET/2; i++)
			{
				int idx = lIdx;
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];
			}
		}
		else if( lIdx < 128 )
		{
			for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)
			{
				int idx = lIdx-64;
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];
				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];
			}
		}
	}

	GROUP_LDS_BARRIER;

	{ // write data
		// Column 0 now holds the full block histogram; publish one counter
		// per bucket for this workgroup.
		if( lIdx < NUM_BUCKET )
		{
			wHistogram1[ lIdx*m_nWorkGroupsToExecute + wgIdx.x ] = localHistogramMat[ lIdx*WG_SIZE+0 ];
		}
	}
}
/*
[numthreads(WG_SIZE, 1, 1)]
void StreamCountKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )
{
int myHistogram[NUM_BUCKET];
for(int i=0; i<NUM_BUCKET; i++)
{
myHistogram[i] = 0;
}
for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)
{
uint localKeys[4];
{ // read data
int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
uint4 localAddress = uint4(lIdx.x, lIdx.x, lIdx.x, lIdx.x)*4+uint4(0,1,2,3);
uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;
KeyValuePair localData0 = dataToSort1[globalAddress.x];
KeyValuePair localData1 = dataToSort1[globalAddress.y];
KeyValuePair localData2 = dataToSort1[globalAddress.z];
KeyValuePair localData3 = dataToSort1[globalAddress.w];
localKeys[0] = (localData0.key >> m_startBit) & 0xf;
localKeys[1] = (localData1.key >> m_startBit) & 0xf;
localKeys[2] = (localData2.key >> m_startBit) & 0xf;
localKeys[3] = (localData3.key >> m_startBit) & 0xf;
}
myHistogram[ localKeys[0] ]++;
myHistogram[ localKeys[1] ]++;
myHistogram[ localKeys[2] ]++;
myHistogram[ localKeys[3] ]++;
}
{ // move to shared
for(int i=0; i<NUM_BUCKET; i++)
{
localHistogramMat[i*WG_SIZE+lIdx.x] = myHistogram[i];
}
}
GROUP_LDS_BARRIER;
{ // reduce to 1
if( lIdx.x < 64 )//WG_SIZE/2 )
{
for(int i=0; i<NUM_BUCKET/2; i++)
{
int idx = lIdx.x;
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];
}
}
else if( lIdx.x < 128 )
{
for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)
{
int idx = lIdx.x-64;
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];
localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];
}
}
}
GROUP_LDS_BARRIER;
{ // write data
if( lIdx.x < NUM_BUCKET )
{
wHistogram1[ lIdx.x*m_nWorkGroupsToExecute + gIdx.x ] = localHistogramMat[ lIdx.x*WG_SIZE+0 ];
}
}
}
*/
/*
// for MAX_WG_SIZE 20
[numthreads(WG_SIZE, 1, 1)]
void PrefixScanKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )
{
uint4 myData = uint4(0,0,0,0);
if( 4*lIdx.x+0 < NUM_BUCKET*m_nWorkGroupsToExecute )
myData.x = wHistogram1[4*lIdx.x+0];
if( 4*lIdx.x+1 < NUM_BUCKET*m_nWorkGroupsToExecute )
myData.y = wHistogram1[4*lIdx.x+1];
if( 4*lIdx.x+2 < NUM_BUCKET*m_nWorkGroupsToExecute )
myData.z = wHistogram1[4*lIdx.x+2];
if( 4*lIdx.x+3 < NUM_BUCKET*m_nWorkGroupsToExecute )
myData.w = wHistogram1[4*lIdx.x+3];
uint totalSum;
uint4 scanned = localPrefixSum128V( myData, lIdx.x, totalSum );
wHistogram1[4*lIdx.x+0] = scanned.x;
wHistogram1[4*lIdx.x+1] = scanned.y;
wHistogram1[4*lIdx.x+2] = scanned.z;
wHistogram1[4*lIdx.x+3] = scanned.w;
}
*/
// for MAX_WG_SIZE 80
// can hold up to WG_SIZE*12 (128*12 > 80*16 )
// Exclusive prefix scan over the NUM_BUCKET*m_nWorkGroupsToExecute histogram
// counters in wHistogram1, in place. Each thread owns 12 consecutive counters
// (128*12 = 1536 slots covers up to 80 workgroups * 16 buckets = 1280).
[numthreads(WG_SIZE, 1, 1)]
void PrefixScanKernel( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;

	// Load this thread's 12 counters; out-of-range slots read as 0.
	uint data[12] = {0,0,0,0,0,0,0,0,0,0,0,0};
	for(int i=0; i<12; i++)
	{
		if( int(12*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )
			data[i] = wHistogram1[12*lIdx+i];
	}

	// Per-thread partial sums, one per group of THREE elements.
	// Fix: the groups are triples ({0,1,2},{3,4,5},{6,7,8},{9,10,11}), matching
	// the expansion below; the previous code summed pairs (data[0..7] only) and
	// never fed data[8..11] into the scan.
	uint4 myData = uint4(0,0,0,0);
	myData.x = data[0] + data[1] + data[2];
	myData.y = data[3] + data[4] + data[5];
	myData.z = data[6] + data[7] + data[8];
	myData.w = data[9] + data[10] + data[11];

	uint totalSum;
	// Block-wide exclusive scan of each thread's 4 group sums.
	uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );

	// Expand the scanned group bases back into 12 exclusive prefix values:
	// slot = its group's scanned base + the sum of earlier slots in the triple.
	// Written back-to-front so each original input is still intact when read.
	data[11] = scanned.w + data[9] + data[10];
	data[10] = scanned.w + data[9];
	data[9] = scanned.w;
	data[8] = scanned.z + data[6] + data[7];
	data[7] = scanned.z + data[6];
	data[6] = scanned.z;
	data[5] = scanned.y + data[3] + data[4];
	data[4] = scanned.y + data[3];
	data[3] = scanned.y;
	data[2] = scanned.x + data[0] + data[1];
	data[1] = scanned.x + data[0];
	data[0] = scanned.x;

	// Write back only the valid range (mirrors the guarded read above).
	for(int i=0; i<12; i++)
	{
		if( int(12*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )
			wHistogram1[12*lIdx+i] = data[i];
	}
}
/*
[numthreads(WG_SIZE, 1, 1)]
void PrefixScanKernel( DEFAULT_ARGS )
{
u32 lIdx = GET_LOCAL_IDX;
u32 wgIdx = GET_GROUP_IDX;
uint data[8] = {0,0,0,0,0,0,0,0};
for(int i=0; i<8; i++)
{
if( int(8*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )
data[i] = wHistogram1[8*lIdx+i];
}
uint4 myData = uint4(0,0,0,0);
myData.x = data[0] + data[1];
myData.y = data[2] + data[3];
myData.z = data[4] + data[5];
myData.w = data[6] + data[7];
uint totalSum;
uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );
data[7] = scanned.w + data[6];
data[6] = scanned.w;// + data[5];
data[5] = scanned.z + data[4];
data[4] = scanned.z;// + data[3];
data[3] = scanned.y + data[2];
data[2] = scanned.y;// + data[1];
data[1] = scanned.x + data[0];
data[0] = scanned.x;
for(int i=0; i<8; i++)
{
wHistogram1[8*lIdx+i] = data[i];
}
}
*/
// Straight copy of the blocks owned by this workgroup from dataToSort (u0) to
// dataToSortOut (u1); each thread moves 4 consecutive key-value pairs per block.
[numthreads(WG_SIZE, 1, 1)]
void CopyKernel( DEFAULT_ARGS )
{
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;

	for(uint iblock=wgIdx.x*m_nBlocksPerGroup; iblock<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); iblock++)
	{
		int elemsPerBlock = WG_SIZE*ELEMENTS_PER_WORK_ITEM;
		// Base address of this thread's 4-element run within the block.
		uint addr = iblock*elemsPerBlock + lIdx*4;

		for(int ie=0; ie<4; ie++)
		{
			dataToSortOut[addr+ie] = dataToSort[addr+ie];
		}
	}
}

View File

@@ -0,0 +1,987 @@
static const char* radixSortAdvancedKernelsDX11= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"#define min2 min\n"
"#define max2 max\n"
"\n"
"\n"
"cbuffer CB0 : register( b0 )\n"
"{\n"
" int m_startBit;\n"
" int m_totalBlocks;\n"
" int m_nWorkGroupsToExecute;\n"
" int m_nBlocksPerGroup;\n"
"\n"
"};\n"
"\n"
"\n"
"typedef struct {\n"
" unsigned int key;\n"
" unsigned int value;\n"
"} KeyValuePair;\n"
"\n"
"\n"
"StructuredBuffer<u32> rHistogram : register(t0);\n"
"\n"
"RWStructuredBuffer<KeyValuePair> dataToSort : register( u0 );\n"
"RWStructuredBuffer<KeyValuePair> dataToSortOut : register( u1 );\n"
"\n"
"\n"
"\n"
"#define WG_SIZE 128\n"
"#define ELEMENTS_PER_WORK_ITEM 4\n"
"#define BITS_PER_PASS 4\n"
"#define NUM_BUCKET (1<<BITS_PER_PASS)\n"
"\n"
"\n"
"groupshared u32 sorterSharedMemory[max(WG_SIZE*2*2, WG_SIZE*ELEMENTS_PER_WORK_ITEM*2)];\n"
"groupshared u32 localHistogramToCarry[NUM_BUCKET];\n"
"groupshared u32 localHistogram[NUM_BUCKET*2];\n"
"groupshared u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
"groupshared u32 localPrefixSum[NUM_BUCKET];\n"
"\n"
"\n"
"\n"
"#define SET_LOCAL_SORT_DATA(idx, sortDataIn) sorterSharedMemory[2*(idx)+0] = sortDataIn.key; sorterSharedMemory[2*(idx)+1] = sortDataIn.value; \n"
"#define GET_LOCAL_SORT_DATA(idx, sortDataOut) sortDataOut.key = sorterSharedMemory[2*(idx)+0]; sortDataOut.value = sorterSharedMemory[2*(idx)+1];\n"
"\n"
"\n"
"\n"
"uint4 prefixScanVector( uint4 data )\n"
"{\n"
" data.y += data.x;\n"
" data.w += data.z;\n"
" data.z += data.y;\n"
" data.w += data.y;\n"
" return data;\n"
"}\n"
"\n"
"uint prefixScanVectorEx( inout uint4 data )\n"
"{\n"
" uint4 backup = data;\n"
" data.y += data.x;\n"
" data.w += data.z;\n"
" data.z += data.y;\n"
" data.w += data.y;\n"
" uint sum = data.w;\n"
" data -= backup;\n"
" return sum;\n"
"}\n"
"\n"
"uint localPrefixScan128( uint pData, uint lIdx, inout uint totalSum )\n"
"{\n"
" { // Set data\n"
" sorterSharedMemory[lIdx] = 0;\n"
" sorterSharedMemory[lIdx+WG_SIZE] = pData;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // Prefix sum\n"
" int idx = 2*lIdx + (WG_SIZE+1);\n"
" if( lIdx < 64 )\n"
" {\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
" }\n"
" if( lIdx < 64 ) sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" totalSum = sorterSharedMemory[WG_SIZE*2-1];\n"
" return sorterSharedMemory[lIdx+127];\n"
"}\n"
"\n"
"void localPrefixScan128Dual( uint pData0, uint pData1, uint lIdx, \n"
" inout uint rank0, inout uint rank1,\n"
" inout uint totalSum0, inout uint totalSum1 )\n"
"{\n"
" { // Set data\n"
" sorterSharedMemory[lIdx] = 0;\n"
" sorterSharedMemory[lIdx+WG_SIZE] = pData0;\n"
" sorterSharedMemory[2*WG_SIZE+lIdx] = 0;\n"
" sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
"// if( lIdx < 128 ) // todo. assert wg size is 128\n"
" { // Prefix sum\n"
" int blockIdx = lIdx/64;\n"
" int groupIdx = lIdx%64;\n"
" int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;\n"
"\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
"\n"
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" totalSum0 = sorterSharedMemory[WG_SIZE*2-1];\n"
" rank0 = sorterSharedMemory[lIdx+127];\n"
" totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];\n"
" rank1 = sorterSharedMemory[2*WG_SIZE+lIdx+127];\n"
"}\n"
"\n"
"uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum )\n"
"{\n"
" { // Set data\n"
" sorterSharedMemory[lIdx] = 0;\n"
" sorterSharedMemory[lIdx+WG_SIZE] = prefixScanVectorEx( pData );\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // Prefix sum\n"
" int idx = 2*lIdx + (WG_SIZE+1);\n"
" if( lIdx < 64 )\n"
" {\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
"\n"
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" totalSum = sorterSharedMemory[WG_SIZE*2-1];\n"
" uint addValue = sorterSharedMemory[lIdx+127];\n"
" return pData + uint4(addValue, addValue, addValue, addValue);\n"
"}\n"
"\n"
"void localPrefixSum128Dual( uint4 pData0, uint4 pData1, uint lIdx, \n"
" inout uint4 dataOut0, inout uint4 dataOut1, \n"
" inout uint totalSum0, inout uint totalSum1 )\n"
"{\n"
"/*\n"
" dataOut0 = localPrefixSum128V( pData0, lIdx, totalSum0 );\n"
" GROUP_LDS_BARRIER;\n"
" dataOut1 = localPrefixSum128V( pData1, lIdx, totalSum1 );\n"
" return;\n"
"*/\n"
"\n"
" uint4 backup0 = pData0;\n"
" uint4 backup1 = pData1;\n"
"\n"
" { // Prefix sum in a vector\n"
" pData0 = prefixScanVector( pData0 );\n"
" pData1 = prefixScanVector( pData1 );\n"
" }\n"
"\n"
" { // Set data\n"
" sorterSharedMemory[lIdx] = 0;\n"
" sorterSharedMemory[lIdx+WG_SIZE] = pData0.w;\n"
" sorterSharedMemory[2*WG_SIZE+lIdx] = 0;\n"
" sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1.w;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
"// if( lIdx < 128 ) // todo. assert wg size is 128\n"
" { // Prefix sum\n"
" int blockIdx = lIdx/64;\n"
" int groupIdx = lIdx%64;\n"
" int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;\n"
"\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
"\n"
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" totalSum0 = sorterSharedMemory[WG_SIZE*2-1];\n"
" {\n"
" uint addValue = sorterSharedMemory[lIdx+127];\n"
" dataOut0 = pData0 + uint4(addValue, addValue, addValue, addValue) - backup0;\n"
" }\n"
"\n"
" totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];\n"
" {\n"
" uint addValue = sorterSharedMemory[2*WG_SIZE+lIdx+127];\n"
" dataOut1 = pData1 + uint4(addValue, addValue, addValue, addValue) - backup1;\n"
" }\n"
"}\n"
"\n"
"uint4 extractKeys(uint4 data, uint targetKey)\n"
"{\n"
" uint4 key;\n"
" key.x = data.x == targetKey ? 1:0;\n"
" key.y = data.y == targetKey ? 1:0;\n"
" key.z = data.z == targetKey ? 1:0;\n"
" key.w = data.w == targetKey ? 1:0;\n"
" return key;\n"
"}\n"
"\n"
"uint4 extractKeysByBits(uint4 data, uint targetKey)\n"
"{\n"
" uint4 key;\n"
" uint mask = 1<<targetKey;\n"
" key.x = (data.x & mask) >> targetKey;\n"
" key.y = (data.y & mask) >> targetKey;\n"
" key.z = (data.z & mask) >> targetKey;\n"
" key.w = (data.w & mask) >> targetKey;\n"
" return key;\n"
"}\n"
"\n"
"uint packKeys(uint lower, uint upper)\n"
"{\n"
" return lower|(upper<<16);\n"
"}\n"
"\n"
"uint4 packKeys(uint4 lower, uint4 upper)\n"
"{\n"
" return uint4( lower.x|(upper.x<<16), lower.y|(upper.y<<16), lower.z|(upper.z<<16), lower.w|(upper.w<<16) );\n"
"}\n"
"\n"
"uint extractLower( uint data )\n"
"{\n"
" return data&0xffff;\n"
"}\n"
"\n"
"uint extractUpper( uint data )\n"
"{\n"
" return (data>>16)&0xffff;\n"
"}\n"
"\n"
"uint4 extractLower( uint4 data )\n"
"{\n"
" return uint4( data.x&0xffff, data.y&0xffff, data.z&0xffff, data.w&0xffff );\n"
"}\n"
"\n"
"uint4 extractUpper( uint4 data )\n"
"{\n"
" return uint4( (data.x>>16)&0xffff, (data.y>>16)&0xffff, (data.z>>16)&0xffff, (data.w>>16)&0xffff );\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void SortAndScatterKernel( DEFAULT_ARGS ) \n"
"{\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
" if( lIdx < (NUM_BUCKET) )\n"
" {\n"
" localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" for(uint igroup=wgIdx*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx+1)*m_nBlocksPerGroup); igroup++)\n"
" {\n"
" u32 myHistogram;\n"
" if( lIdx < (NUM_BUCKET) )\n"
" {\n"
" localPrefixSum[lIdx] = 0.f;\n"
" }\n"
"\n"
" u32 newOffset[4];\n"
" KeyValuePair myData[4];\n"
" { // read data\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
" uint startAddress = igroup*numLocalElements + lIdx*4;\n"
"\n"
" myData[0] = dataToSort[startAddress+0];\n"
" myData[1] = dataToSort[startAddress+1];\n"
" myData[2] = dataToSort[startAddress+2];\n"
" myData[3] = dataToSort[startAddress+3];\n"
"\n"
" newOffset[0] = newOffset[1] = newOffset[2] = newOffset[3] = 0;\n"
" }\n"
"\n"
" int localOffset = 0;\n"
" uint4 b = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);\n"
" for(uint targetKey=0; targetKey<(NUM_BUCKET); targetKey+=4)\n"
" {\n"
" uint4 key[4];\n"
" uint keySet[2];\n"
" { // pack 4\n"
" uint4 scannedKey[4];\n"
" key[0] = scannedKey[0] = extractKeys( b, targetKey+0 );\n"
" key[1] = scannedKey[1] = extractKeys( b, targetKey+1 );\n"
" key[2] = scannedKey[2] = extractKeys( b, targetKey+2 );\n"
" key[3] = scannedKey[3] = extractKeys( b, targetKey+3 );\n"
" {\n"
" uint s[4];\n"
" s[0] = prefixScanVectorEx( scannedKey[0] );\n"
" s[1] = prefixScanVectorEx( scannedKey[1] );\n"
" s[2] = prefixScanVectorEx( scannedKey[2] );\n"
" s[3] = prefixScanVectorEx( scannedKey[3] );\n"
" keySet[0] = packKeys( s[0], s[1] );\n"
" keySet[1] = packKeys( s[2], s[3] );\n"
" }\n"
" }\n"
"\n"
" uint dstAddressBase[4];\n"
" {\n"
"\n"
" uint totalSumPacked[2];\n"
" uint dstAddressPacked[2];\n"
"\n"
" localPrefixScan128Dual( keySet[0], keySet[1], lIdx, dstAddressPacked[0], dstAddressPacked[1], totalSumPacked[0], totalSumPacked[1] );\n"
"\n"
" dstAddressBase[0] = extractLower( dstAddressPacked[0] );\n"
" dstAddressBase[1] = extractUpper( dstAddressPacked[0] );\n"
" dstAddressBase[2] = extractLower( dstAddressPacked[1] );\n"
" dstAddressBase[3] = extractUpper( dstAddressPacked[1] );\n"
"\n"
" uint4 histogram;\n"
" histogram.x = extractLower(totalSumPacked[0]);\n"
" histogram.y = extractUpper(totalSumPacked[0]);\n"
" histogram.z = extractLower(totalSumPacked[1]);\n"
" histogram.w = extractUpper(totalSumPacked[1]);\n"
"\n"
" if( lIdx == targetKey + 0 ) myHistogram = histogram.x;\n"
" else if( lIdx == targetKey + 1 ) myHistogram = histogram.y;\n"
" else if( lIdx == targetKey + 2 ) myHistogram = histogram.z;\n"
" else if( lIdx == targetKey + 3 ) myHistogram = histogram.w;\n"
" \n"
" uint histogramSum = prefixScanVectorEx( histogram );\n"
"\n"
" if( lIdx == targetKey + 0 ) localPrefixSum[targetKey+0] = localOffset+histogram.x;\n"
" else if( lIdx == targetKey + 1 ) localPrefixSum[targetKey+1] = localOffset+histogram.y;\n"
" else if( lIdx == targetKey + 2 ) localPrefixSum[targetKey+2] = localOffset+histogram.z;\n"
" else if( lIdx == targetKey + 3 ) localPrefixSum[targetKey+3] = localOffset+histogram.w;\n"
"\n"
" localOffset += histogramSum;\n"
" }\n"
" \n"
" GROUP_LDS_BARRIER;\n"
"\n"
"\n"
" for(int ie=0; ie<4; ie++)\n"
" {\n"
" uint4 scannedKey = key[ie];\n"
" prefixScanVectorEx( scannedKey );\n"
"\n"
" uint offset = localPrefixSum[targetKey + ie] + dstAddressBase[ie];\n"
" uint4 dstAddress = uint4( offset, offset, offset, offset ) + scannedKey;\n"
"\n"
" newOffset[0] += dstAddress.x*key[ie].x;\n"
" newOffset[1] += dstAddress.y*key[ie].y;\n"
" newOffset[2] += dstAddress.z*key[ie].z;\n"
" newOffset[3] += dstAddress.w*key[ie].w;\n"
" }\n"
" }\n"
"\n"
" { // local scatter\n"
" SET_LOCAL_SORT_DATA(newOffset[0], myData[0]);\n"
" SET_LOCAL_SORT_DATA(newOffset[1], myData[1]);\n"
" SET_LOCAL_SORT_DATA(newOffset[2], myData[2]);\n"
" SET_LOCAL_SORT_DATA(newOffset[3], myData[3]);\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // write data\n"
" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
" {\n"
" int dataIdx = 4*lIdx+i;\n"
" KeyValuePair localData; GET_LOCAL_SORT_DATA( dataIdx, localData );\n"
" int binIdx = (localData.key >> m_startBit) & 0xf;\n"
" int groupOffset = localHistogramToCarry[binIdx];\n"
" int myIdx = dataIdx - localPrefixSum[binIdx];\n"
"\n"
" dataToSortOut[ groupOffset + myIdx ] = localData;\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogramToCarry[lIdx] += myHistogram;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
"}\n"
"\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void SortAndScatterKernel1( DEFAULT_ARGS )\n"
"{\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
" if( lIdx < (NUM_BUCKET) )\n"
" {\n"
" localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx.x];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
" {\n"
" u32 myHistogram;\n"
"\n"
" KeyValuePair myData[4];\n"
" uint startAddrBlock;\n"
" { // read data\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
" startAddrBlock = lIdx*4;\n"
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
" myData[0] = dataToSort[startAddress+0];\n"
" myData[1] = dataToSort[startAddress+1];\n"
" myData[2] = dataToSort[startAddress+2];\n"
" myData[3] = dataToSort[startAddress+3];\n"
" }\n"
"\n"
" // local sort\n"
" for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)\n"
" {\n"
" uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);\n"
" uint total;\n"
" uint4 rankOfP = localPrefixSum128V( keys, lIdx, total );\n"
" uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );\n"
"\n"
" uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;\n"
" \n"
" GROUP_LDS_BARRIER;\n"
"\n"
" SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );\n"
" SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );\n"
" SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );\n"
" SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );\n"
" GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );\n"
" GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );\n"
" GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );\n"
" }\n"
"\n"
" {// create histogram -> prefix sum\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[lIdx] = 0;\n"
" localHistogram[NUM_BUCKET+lIdx] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);\n"
" \n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );\n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );\n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );\n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" \n"
" uint hIdx = NUM_BUCKET+lIdx;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" myHistogram = localHistogram[hIdx];\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
"\n"
" localHistogram[hIdx] += localHistogram[hIdx-1];\n"
" localHistogram[hIdx] += localHistogram[hIdx-2];\n"
" localHistogram[hIdx] += localHistogram[hIdx-4];\n"
" localHistogram[hIdx] += localHistogram[hIdx-8];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
"/*\n"
" {// write back\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
" startAddrBlock = lIdx*4;\n"
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
" {\n"
" dataToSortOut[ startAddress+ie ] = myData[ie];\n"
" }\n"
" }\n"
"*/\n"
" {\n"
" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
" {\n"
" int dataIdx = startAddrBlock+ie;\n"
" int binIdx = (myData[ie].key>>m_startBit)&0xf;\n"
" int groupOffset = localHistogramToCarry[binIdx];\n"
" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
" dataToSortOut[ groupOffset + myIdx ] = myData[ie];\n"
" }\n"
" }\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" localHistogramToCarry[lIdx] += myHistogram;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" }\n"
"}\n"
"\n"
"/*\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void SortAndScatterKernel1( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )\n"
"{\n"
" if( lIdx.x < (NUM_BUCKET) )\n"
" {\n"
" localHistogramToCarry[lIdx.x] = rHistogram[lIdx.x*m_nWorkGroupsToExecute + gIdx.x];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
" {\n"
" u32 myHistogram;\n"
"\n"
" KeyValuePair myData[4];\n"
" uint startAddrBlock;\n"
" { // read data\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
" startAddrBlock = lIdx.x*4;\n"
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
" myData[0] = dataToSort[startAddress+0];\n"
" myData[1] = dataToSort[startAddress+1];\n"
" myData[2] = dataToSort[startAddress+2];\n"
" myData[3] = dataToSort[startAddress+3];\n"
" }\n"
"\n"
" for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)\n"
" {\n"
" uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);\n"
" uint total;\n"
" uint4 rankOfP = localPrefixSum128V( keys, lIdx.x, total );\n"
" uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );\n"
"\n"
" uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;\n"
" \n"
" GROUP_LDS_BARRIER;\n"
"\n"
" SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );\n"
" SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );\n"
" SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );\n"
" SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );\n"
" GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );\n"
" GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );\n"
" GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );\n"
" }\n"
" \n"
" {// create histogram -> prefix sum\n"
" if( lIdx.x < NUM_BUCKET )\n"
" {\n"
" localHistogram[lIdx.x] = 0;\n"
" localHistogram[NUM_BUCKET+lIdx.x] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);\n"
" \n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );\n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );\n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );\n"
" InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" \n"
" uint hIdx = NUM_BUCKET+lIdx.x;\n"
" if( lIdx.x < NUM_BUCKET )\n"
" {\n"
" myHistogram = localHistogram[hIdx];\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
"\n"
" if( lIdx.x < NUM_BUCKET )\n"
" {\n"
" localHistogram[hIdx] = localHistogram[hIdx-1];\n"
"\n"
" localHistogram[hIdx] += localHistogram[hIdx-1];\n"
" localHistogram[hIdx] += localHistogram[hIdx-2];\n"
" localHistogram[hIdx] += localHistogram[hIdx-4];\n"
" localHistogram[hIdx] += localHistogram[hIdx-8];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" {// write back\n"
" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
" {\n"
" int dataIdx = startAddrBlock+ie;\n"
" int binIdx = (myData[ie].key>>m_startBit)&0xf;\n"
" int groupOffset = localHistogramToCarry[binIdx];\n"
" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
" \n"
" dataToSortOut[ groupOffset + myIdx ] = myData[ie];\n"
" }\n"
" }\n"
" \n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx.x < NUM_BUCKET )\n"
" {\n"
" localHistogramToCarry[lIdx.x] += myHistogram;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" }\n"
"}\n"
"*/\n"
"\n"
"StructuredBuffer<KeyValuePair> dataToSort1 : register( t0 );\n"
"RWStructuredBuffer<u32> wHistogram1 : register(u0);\n"
"\n"
"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx.x]\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void StreamCountKernel( DEFAULT_ARGS ) \n"
"{\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
" int myHistogram[NUM_BUCKET];\n"
"\n"
" for(int i=0; i<NUM_BUCKET; i++)\n"
" {\n"
" MY_HISTOGRAM(i) = 0;\n"
" }\n"
"\n"
" for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
" {\n"
" uint localKeys[4];\n"
" { // read data\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
"\n"
" uint4 localAddress = uint4(lIdx, lIdx, lIdx, lIdx)*4+uint4(0,1,2,3);\n"
" uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;\n"
"\n"
" KeyValuePair localData0 = dataToSort1[globalAddress.x];\n"
" KeyValuePair localData1 = dataToSort1[globalAddress.y];\n"
" KeyValuePair localData2 = dataToSort1[globalAddress.z];\n"
" KeyValuePair localData3 = dataToSort1[globalAddress.w];\n"
"\n"
" localKeys[0] = (localData0.key >> m_startBit) & 0xf;\n"
" localKeys[1] = (localData1.key >> m_startBit) & 0xf;\n"
" localKeys[2] = (localData2.key >> m_startBit) & 0xf;\n"
" localKeys[3] = (localData3.key >> m_startBit) & 0xf;\n"
" }\n"
"\n"
" MY_HISTOGRAM( localKeys[0] )++;\n"
" MY_HISTOGRAM( localKeys[1] )++;\n"
" MY_HISTOGRAM( localKeys[2] )++;\n"
" MY_HISTOGRAM( localKeys[3] )++;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // reduce to 1\n"
" if( lIdx < 64 )//WG_SIZE/2 )\n"
" {\n"
" for(int i=0; i<NUM_BUCKET/2; i++)\n"
" {\n"
" int idx = lIdx;\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
" }\n"
" }\n"
" else if( lIdx < 128 )\n"
" {\n"
" for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)\n"
" {\n"
" int idx = lIdx-64;\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
" }\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // write data\n"
" if( lIdx < NUM_BUCKET )\n"
" {\n"
" wHistogram1[ lIdx*m_nWorkGroupsToExecute + wgIdx.x ] = localHistogramMat[ lIdx*WG_SIZE+0 ];\n"
" }\n"
" }\n"
"}\n"
"\n"
"/*\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void StreamCountKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID ) \n"
"{\n"
" int myHistogram[NUM_BUCKET];\n"
"\n"
" for(int i=0; i<NUM_BUCKET; i++)\n"
" {\n"
" myHistogram[i] = 0;\n"
" }\n"
"\n"
" for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
" {\n"
" uint localKeys[4];\n"
" { // read data\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
"\n"
" uint4 localAddress = uint4(lIdx.x, lIdx.x, lIdx.x, lIdx.x)*4+uint4(0,1,2,3);\n"
" uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;\n"
"\n"
" KeyValuePair localData0 = dataToSort1[globalAddress.x];\n"
" KeyValuePair localData1 = dataToSort1[globalAddress.y];\n"
" KeyValuePair localData2 = dataToSort1[globalAddress.z];\n"
" KeyValuePair localData3 = dataToSort1[globalAddress.w];\n"
"\n"
" localKeys[0] = (localData0.key >> m_startBit) & 0xf;\n"
" localKeys[1] = (localData1.key >> m_startBit) & 0xf;\n"
" localKeys[2] = (localData2.key >> m_startBit) & 0xf;\n"
" localKeys[3] = (localData3.key >> m_startBit) & 0xf;\n"
" }\n"
"\n"
" myHistogram[ localKeys[0] ]++;\n"
" myHistogram[ localKeys[1] ]++;\n"
" myHistogram[ localKeys[2] ]++;\n"
" myHistogram[ localKeys[3] ]++;\n"
" }\n"
"\n"
" { // move to shared\n"
" for(int i=0; i<NUM_BUCKET; i++)\n"
" {\n"
" localHistogramMat[i*WG_SIZE+lIdx.x] = myHistogram[i];\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // reduce to 1\n"
" if( lIdx.x < 64 )//WG_SIZE/2 )\n"
" {\n"
" for(int i=0; i<NUM_BUCKET/2; i++)\n"
" {\n"
" int idx = lIdx.x;\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
" }\n"
" }\n"
" else if( lIdx.x < 128 )\n"
" {\n"
" for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)\n"
" {\n"
" int idx = lIdx.x-64;\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
" localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
" }\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // write data\n"
" if( lIdx.x < NUM_BUCKET )\n"
" {\n"
" wHistogram1[ lIdx.x*m_nWorkGroupsToExecute + gIdx.x ] = localHistogramMat[ lIdx.x*WG_SIZE+0 ];\n"
" }\n"
" }\n"
"}\n"
"*/\n"
"\n"
"/*\n"
"// for MAX_WG_SIZE 20\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void PrefixScanKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID ) \n"
"{\n"
" uint4 myData = uint4(0,0,0,0);\n"
" if( 4*lIdx.x+0 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
" myData.x = wHistogram1[4*lIdx.x+0];\n"
" if( 4*lIdx.x+1 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
" myData.y = wHistogram1[4*lIdx.x+1];\n"
" if( 4*lIdx.x+2 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
" myData.z = wHistogram1[4*lIdx.x+2];\n"
" if( 4*lIdx.x+3 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
" myData.w = wHistogram1[4*lIdx.x+3];\n"
"\n"
" uint totalSum;\n"
"\n"
" uint4 scanned = localPrefixSum128V( myData, lIdx.x, totalSum );\n"
"\n"
" wHistogram1[4*lIdx.x+0] = scanned.x;\n"
" wHistogram1[4*lIdx.x+1] = scanned.y;\n"
" wHistogram1[4*lIdx.x+2] = scanned.z;\n"
" wHistogram1[4*lIdx.x+3] = scanned.w;\n"
"}\n"
"*/\n"
"\n"
"// for MAX_WG_SIZE 80\n"
"// can hold up to WG_SIZE*12 (128*12 > 80*16 )\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void PrefixScanKernel( DEFAULT_ARGS )\n"
"{\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
" uint data[12] = {0,0,0,0,0,0,0,0,0,0,0,0};\n"
" for(int i=0; i<12; i++)\n"
" {\n"
" if( int(12*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
" data[i] = wHistogram1[12*lIdx+i];\n"
" }\n"
"\n"
" uint4 myData = uint4(0,0,0,0);\n"
" myData.x = data[0] + data[1];\n"
" myData.y = data[2] + data[3];\n"
" myData.z = data[4] + data[5];\n"
" myData.w = data[6] + data[7];\n"
"\n"
"\n"
" uint totalSum;\n"
" uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );\n"
"\n"
" data[11] = scanned.w + data[9] + data[10];\n"
" data[10] = scanned.w + data[9];\n"
" data[9] = scanned.w;\n"
" data[8] = scanned.z + data[6] + data[7];\n"
" data[7] = scanned.z + data[6];\n"
" data[6] = scanned.z;\n"
" data[5] = scanned.y + data[3] + data[4];\n"
" data[4] = scanned.y + data[3];\n"
" data[3] = scanned.y;\n"
" data[2] = scanned.x + data[0] + data[1];\n"
" data[1] = scanned.x + data[0];\n"
" data[0] = scanned.x;\n"
"\n"
" for(int i=0; i<12; i++)\n"
" {\n"
" wHistogram1[12*lIdx+i] = data[i];\n"
" }\n"
"}\n"
"/*\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void PrefixScanKernel( DEFAULT_ARGS )\n"
"{\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
" uint data[8] = {0,0,0,0,0,0,0,0};\n"
" for(int i=0; i<8; i++)\n"
" {\n"
" if( int(8*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
" data[i] = wHistogram1[8*lIdx+i];\n"
" }\n"
"\n"
" uint4 myData = uint4(0,0,0,0);\n"
" myData.x = data[0] + data[1];\n"
" myData.y = data[2] + data[3];\n"
" myData.z = data[4] + data[5];\n"
" myData.w = data[6] + data[7];\n"
"\n"
"\n"
" uint totalSum;\n"
" uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );\n"
"\n"
" data[7] = scanned.w + data[6];\n"
" data[6] = scanned.w;// + data[5];\n"
" data[5] = scanned.z + data[4];\n"
" data[4] = scanned.z;// + data[3];\n"
" data[3] = scanned.y + data[2];\n"
" data[2] = scanned.y;// + data[1];\n"
" data[1] = scanned.x + data[0];\n"
" data[0] = scanned.x;\n"
"\n"
" for(int i=0; i<8; i++)\n"
" {\n"
" wHistogram1[8*lIdx+i] = data[i];\n"
" }\n"
"}\n"
"*/\n"
"\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void CopyKernel( DEFAULT_ARGS )\n"
"{\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
" for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
" {\n"
" KeyValuePair myData[4];\n"
" uint startAddrBlock;\n"
" { // read data\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
" startAddrBlock = lIdx*4;\n"
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
" myData[0] = dataToSort[startAddress+0];\n"
" myData[1] = dataToSort[startAddress+1];\n"
" myData[2] = dataToSort[startAddress+2];\n"
" myData[3] = dataToSort[startAddress+3];\n"
" }\n"
"\n"
" {\n"
" int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
" uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
" dataToSortOut[startAddress+0] = myData[0];\n"
" dataToSortOut[startAddress+1] = myData[1];\n"
" dataToSortOut[startAddress+2] = myData[2];\n"
" dataToSortOut[startAddress+3] = myData[3];\n"
" }\n"
" }\n"
"}\n"
;

View File

@@ -0,0 +1,93 @@
/*
2011 Takahiro Harada
*/
template<>
class RadixSort<TYPE_HOST> : public RadixSortBase
{
public:
struct Data
{
HostBuffer<SortData>* m_workBuffer;
};
enum
{
BITS_PER_PASS = 8,
NUM_TABLES = (1<<BITS_PER_PASS),
};
static
Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_STANDARD)
{
ADLASSERT( deviceData->m_type == TYPE_HOST );
Data* data = new Data;
data->m_workBuffer = new HostBuffer<SortData>( deviceData, maxSize );
return data;
}
static
void deallocate(Data* data)
{
delete data->m_workBuffer;
delete data;
}
static
void execute(Data* data, Buffer<SortData>& inout, int n, int sortBits = 32)
{
ADLASSERT( inout.getType() == TYPE_HOST );
int tables[NUM_TABLES];
int counter[NUM_TABLES];
SortData* src = inout.m_ptr;
SortData* dst = data->m_workBuffer->m_ptr;
int count=0;
for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
{
for(int i=0; i<NUM_TABLES; i++)
{
tables[i] = 0;
}
for(int i=0; i<n; i++)
{
int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
tables[tableIdx]++;
}
// prefix scan
int sum = 0;
for(int i=0; i<NUM_TABLES; i++)
{
int iData = tables[i];
tables[i] = sum;
sum += iData;
counter[i] = 0;
}
// distribute
for(int i=0; i<n; i++)
{
int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
dst[tables[tableIdx] + counter[tableIdx]] = src[i];
counter[tableIdx] ++;
}
swap2( src, dst );
count++;
}
{
if (count&1)
//if( src != inout.m_ptr )
{
memcpy( dst, src, sizeof(SortData)*n );
}
}
}
};

View File

@@ -0,0 +1,134 @@
// OpenCL program text for the "simple" radix sort (8 bits per pass), embedded
// as a C string so the driver can compile it at runtime. Two kernels:
//   LocalCountKernel - per-work-group 256-bin histogram of the current key
//                      digit, written out bucket-major for a later prefix scan.
//   ScatterKernel    - uses the scanned histograms to move each element to its
//                      destination for this pass.
// NOTE(review): this string appears to duplicate the raw .cl kernel source in
// this same commit - keep the two in sync when editing.
static const char* radixSortSimpleKernelsCL = \
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"\n"
"#define WG_SIZE 128\n"
"#define NUM_PER_WI 4\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_key;\n"
" u32 m_value;\n"
"}SortData;\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_startBit;\n"
" u32 m_numGroups;\n"
" u32 m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
// Kernel 1: per-group digit histogram. 16 LDS sub-histograms (indexed by
// lIdx%16) spread the atomic traffic; they are reduced before write-out.
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void LocalCountKernel(__global SortData* sortData,\n"
" __global u32* ldsHistogramOut,\n"
" ConstBuffer cb)\n"
"{\n"
" __local u32 ldsHistogram[16][256];\n"
"\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" for(int i=0; i<16; i++)\n"
" {\n"
" ldsHistogram[i][lIdx] = 0.f;\n"
" ldsHistogram[i][lIdx+128] = 0.f;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" SortData datas[NUM_PER_WI];\n"
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
" datas[0].m_key = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
" datas[1].m_key = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
" datas[2].m_key = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
" datas[3].m_key = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
"\n"
" int tableIdx = lIdx%16;\n"
"\n"
" AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" u32 sum0, sum1;\n"
" sum0 = sum1 = 0;\n"
" for(int i=0; i<16; i++)\n"
" {\n"
" sum0 += ldsHistogram[i][lIdx];\n"
" sum1 += ldsHistogram[i][lIdx+128];\n"
" }\n"
"\n"
" ldsHistogramOut[lIdx*cb.m_numGroups+GET_GROUP_IDX] = sum0;\n"
" ldsHistogramOut[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX] = sum1;\n"
"}\n"
"\n"
// Kernel 2: scatter. The for(i<WG_SIZE) loop with a barrier each iteration
// lets one work item at a time claim destination slots via atom_inc.
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void ScatterKernel(__global SortData* sortData,\n"
" __global SortData* sortDataOut,\n"
" __global u32* scannedHistogram,\n"
" ConstBuffer cb)\n"
"{\n"
" __local u32 ldsCurrentLocation[256];\n"
"\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" {\n"
" ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*cb.m_numGroups+GET_GROUP_IDX];\n"
" ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" SortData datas[NUM_PER_WI];\n"
" int keys[NUM_PER_WI];\n"
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
" keys[0] = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
" keys[1] = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
" keys[2] = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
" keys[3] = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
"\n"
" int dst[NUM_PER_WI];\n"
" for(int i=0; i<WG_SIZE; i++)\n"
" {\n"
" if( i==lIdx )\n"
" {\n"
" AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);\n"
" AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);\n"
" AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);\n"
" AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" sortDataOut[dst[0]] = datas[0];\n"
" sortDataOut[dst[1]] = datas[1];\n"
" sortDataOut[dst[2]] = datas[2];\n"
" sortDataOut[dst[3]] = datas[3];\n"
"}\n"
"\n"
"";

View File

@@ -0,0 +1,131 @@
// HLSL (DX11 compute) program text for the "simple" radix sort, embedded as a
// C string for runtime compilation. Mirrors the OpenCL variant: a per-group
// histogram kernel (LocalCountKernel) followed by a scatter kernel
// (ScatterKernel) that consumes the scanned histograms. The sort constants
// arrive via cbuffer SortCB instead of a kernel argument.
// NOTE(review): this string appears to duplicate the raw .hlsl kernel source
// in this same commit - keep the two in sync when editing.
static const char* radixSortSimpleKernelsDX11 = \
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128\n"
"#define NUM_PER_WI 4\n"
"\n"
"#define GET_GROUP_SIZE WG_SIZE\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_key;\n"
" u32 m_value;\n"
"}SortData;\n"
"\n"
"cbuffer SortCB : register( b0 )\n"
"{\n"
" u32 m_startBit;\n"
" u32 m_numGroups;\n"
" u32 m_padding[2];\n"
"};\n"
"\n"
"StructuredBuffer<SortData> sortData : register( t0 );\n"
"RWStructuredBuffer<u32> ldsHistogramOut : register( u0 );\n"
"\n"
"groupshared u32 ldsHistogram[16][256];\n"
"\n"
// Kernel 1: per-group digit histogram with 16 groupshared sub-histograms
// (indexed by lIdx%16) to spread InterlockedAdd contention.
"[numthreads(WG_SIZE, 1, 1)]\n"
"void LocalCountKernel( DEFAULT_ARGS )\n"
"{\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" for(int i=0; i<16; i++)\n"
" {\n"
" ldsHistogram[i][lIdx] = 0.f;\n"
" ldsHistogram[i][lIdx+128] = 0.f;\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" SortData datas[NUM_PER_WI];\n"
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
" datas[0].m_key = (datas[0].m_key >> m_startBit) & 0xff;\n"
" datas[1].m_key = (datas[1].m_key >> m_startBit) & 0xff;\n"
" datas[2].m_key = (datas[2].m_key >> m_startBit) & 0xff;\n"
" datas[3].m_key = (datas[3].m_key >> m_startBit) & 0xff;\n"
"\n"
" int tableIdx = lIdx%16;\n"
"\n"
" AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);\n"
" AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" u32 sum0, sum1;\n"
" sum0 = sum1 = 0;\n"
" for(int i=0; i<16; i++)\n"
" {\n"
" sum0 += ldsHistogram[i][lIdx];\n"
" sum1 += ldsHistogram[i][lIdx+128];\n"
" }\n"
"\n"
" ldsHistogramOut[lIdx*m_numGroups+GET_GROUP_IDX] = sum0;\n"
" ldsHistogramOut[(lIdx+128)*m_numGroups+GET_GROUP_IDX] = sum1;\n"
"}\n"
"\n"
"\n"
// Kernel 2: scatter using the scanned histograms; the serialized per-work-item
// loop with a barrier each iteration claims destination slots in order.
"RWStructuredBuffer<SortData> sortDataOut : register( u0 );\n"
"RWStructuredBuffer<u32> scannedHistogram : register( u1 );\n"
"\n"
"groupshared u32 ldsCurrentLocation[256];\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void ScatterKernel( DEFAULT_ARGS )\n"
"{\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" {\n"
" ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*m_numGroups+GET_GROUP_IDX];\n"
" ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*m_numGroups+GET_GROUP_IDX];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" SortData datas[NUM_PER_WI];\n"
" int keys[NUM_PER_WI];\n"
" datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
" datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
" datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
" datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
" keys[0] = (datas[0].m_key >> m_startBit) & 0xff;\n"
" keys[1] = (datas[1].m_key >> m_startBit) & 0xff;\n"
" keys[2] = (datas[2].m_key >> m_startBit) & 0xff;\n"
" keys[3] = (datas[3].m_key >> m_startBit) & 0xff;\n"
"\n"
" int dst[NUM_PER_WI];\n"
" for(int i=0; i<WG_SIZE; i++)\n"
"// for(int i=0; i<m_padding[0]; i++) // to reduce compile time\n"
" {\n"
" if( i==lIdx )\n"
" {\n"
" AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);\n"
" AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);\n"
" AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);\n"
" AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" sortDataOut[dst[0]] = datas[0];\n"
" sortDataOut[dst[1]] = datas[1];\n"
" sortDataOut[dst[2]] = datas[2];\n"
" sortDataOut[dst[3]] = datas[3];\n"
"}\n"
"";

View File

@@ -0,0 +1,147 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Author Takahiro Harada
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define WG_SIZE 128
#define NUM_PER_WI 4
typedef struct
{
u32 m_key;
u32 m_value;
}SortData;
typedef struct
{
u32 m_startBit;
u32 m_numGroups;
u32 m_padding[2];
} ConstBuffer;
// Pass 1 of the simple radix sort.
// Each work item loads NUM_PER_WI (4) consecutive SortData elements and bins
// them by the 8-bit digit starting at cb.m_startBit. The work group keeps 16
// copies of the 256-bin histogram in LDS, with work items spread across the
// copies (lIdx%16) to reduce atomic contention; the copies are summed before
// write-out. Output layout is bucket-major (bin*m_numGroups + group) so a
// global prefix scan over ldsHistogramOut yields per-group scatter offsets.
// Requires a work-group size of exactly WG_SIZE (128).
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void LocalCountKernel(__global SortData* sortData,
	__global u32* ldsHistogramOut,
	ConstBuffer cb)
{
	__local u32 ldsHistogram[16][256];

	int lIdx = GET_LOCAL_IDX;
	int gIdx = GET_GLOBAL_IDX;

	// Clear all 16x256 bins; each of the 128 work items clears slots
	// lIdx and lIdx+128 of every copy. Use an integer literal here: the
	// original 0.f relied on implicit float->uint conversion.
	for(int i=0; i<16; i++)
	{
		ldsHistogram[i][lIdx] = 0;
		ldsHistogram[i][lIdx+128] = 0;
	}

	GROUP_LDS_BARRIER;

	SortData datas[NUM_PER_WI];
	datas[0] = sortData[gIdx*NUM_PER_WI+0];
	datas[1] = sortData[gIdx*NUM_PER_WI+1];
	datas[2] = sortData[gIdx*NUM_PER_WI+2];
	datas[3] = sortData[gIdx*NUM_PER_WI+3];

	// Replace each key with its current 8-bit digit; the full key is not
	// needed again in this kernel.
	datas[0].m_key = (datas[0].m_key >> cb.m_startBit) & 0xff;
	datas[1].m_key = (datas[1].m_key >> cb.m_startBit) & 0xff;
	datas[2].m_key = (datas[2].m_key >> cb.m_startBit) & 0xff;
	datas[3].m_key = (datas[3].m_key >> cb.m_startBit) & 0xff;

	int tableIdx = lIdx%16;

	AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);
	AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);
	AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);
	AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);

	GROUP_LDS_BARRIER;

	// Reduce the 16 sub-histograms; each work item sums bins lIdx and lIdx+128.
	u32 sum0, sum1;
	sum0 = sum1 = 0;
	for(int i=0; i<16; i++)
	{
		sum0 += ldsHistogram[i][lIdx];
		sum1 += ldsHistogram[i][lIdx+128];
	}

	ldsHistogramOut[lIdx*cb.m_numGroups+GET_GROUP_IDX] = sum0;
	ldsHistogramOut[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX] = sum1;
}
// Pass 2 of the simple radix sort: moves each element to its destination for
// the current digit. ldsCurrentLocation starts as this group's scanned bucket
// offsets; atom_inc claims one output slot per element.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void ScatterKernel(__global SortData* sortData,
__global SortData* sortDataOut,
__global u32* scannedHistogram,
ConstBuffer cb)
{
__local u32 ldsCurrentLocation[256];
int lIdx = GET_LOCAL_IDX;
int gIdx = GET_GLOBAL_IDX;
// Load this group's scanned histogram (bucket-major layout, see
// LocalCountKernel); each work item loads bins lIdx and lIdx+128.
{
ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*cb.m_numGroups+GET_GROUP_IDX];
ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX];
}
GROUP_LDS_BARRIER;
SortData datas[NUM_PER_WI];
int keys[NUM_PER_WI];
datas[0] = sortData[gIdx*NUM_PER_WI+0];
datas[1] = sortData[gIdx*NUM_PER_WI+1];
datas[2] = sortData[gIdx*NUM_PER_WI+2];
datas[3] = sortData[gIdx*NUM_PER_WI+3];
// Current 8-bit digit of each of this work item's 4 elements.
keys[0] = (datas[0].m_key >> cb.m_startBit) & 0xff;
keys[1] = (datas[1].m_key >> cb.m_startBit) & 0xff;
keys[2] = (datas[2].m_key >> cb.m_startBit) & 0xff;
keys[3] = (datas[3].m_key >> cb.m_startBit) & 0xff;
int dst[NUM_PER_WI];
// Serialize slot assignment: on iteration i only work item i performs its
// atom_inc claims while the rest wait at the barrier, so destinations are
// handed out in work-item order (presumably to keep the scatter stable
// within the group - confirm against the GPU pipeline's requirements).
// The barrier is outside the if, so all work items reach it each iteration.
for(int i=0; i<WG_SIZE; i++)
{
if( i==lIdx )
{
AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);
AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);
AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);
AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);
}
GROUP_LDS_BARRIER;
}
sortDataOut[dst[0]] = datas[0];
sortDataOut[dst[1]] = datas[1];
sortDataOut[dst[2]] = datas[2];
sortDataOut[dst[3]] = datas[3];
}

View File

@@ -0,0 +1,133 @@
/*
2011 Takahiro Harada
*/
typedef uint u32;
#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
#define AtomInc(x) InterlockedAdd(x, 1)
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
// takahiro end
#define WG_SIZE 128
#define NUM_PER_WI 4
#define GET_GROUP_SIZE WG_SIZE
typedef struct
{
u32 m_key;
u32 m_value;
}SortData;
cbuffer SortCB : register( b0 )
{
u32 m_startBit;
u32 m_numGroups;
u32 m_padding[2];
};
StructuredBuffer<SortData> sortData : register( t0 );
RWStructuredBuffer<u32> ldsHistogramOut : register( u0 );
groupshared u32 ldsHistogram[16][256];
// Pass 1 of the simple radix sort (HLSL variant; mirrors the OpenCL kernel).
// Each thread loads NUM_PER_WI (4) elements and bins them by the 8-bit digit
// starting at m_startBit (from cbuffer SortCB). 16 groupshared copies of the
// 256-bin histogram (indexed by lIdx%16) spread InterlockedAdd contention and
// are reduced before write-out. Output is bucket-major
// (bin*m_numGroups + group) so a global prefix scan yields scatter offsets.
[numthreads(WG_SIZE, 1, 1)]
void LocalCountKernel( DEFAULT_ARGS )
{
	int lIdx = GET_LOCAL_IDX;
	int gIdx = GET_GLOBAL_IDX;

	// Clear all 16x256 bins; each of the 128 threads clears slots lIdx and
	// lIdx+128 of every copy. Use an integer literal: the original 0.f
	// relied on implicit float->uint conversion into the u32 LDS array.
	for(int i=0; i<16; i++)
	{
		ldsHistogram[i][lIdx] = 0;
		ldsHistogram[i][lIdx+128] = 0;
	}

	GROUP_LDS_BARRIER;

	SortData datas[NUM_PER_WI];
	datas[0] = sortData[gIdx*NUM_PER_WI+0];
	datas[1] = sortData[gIdx*NUM_PER_WI+1];
	datas[2] = sortData[gIdx*NUM_PER_WI+2];
	datas[3] = sortData[gIdx*NUM_PER_WI+3];

	// Replace each key with its current 8-bit digit; the full key is not
	// needed again in this kernel.
	datas[0].m_key = (datas[0].m_key >> m_startBit) & 0xff;
	datas[1].m_key = (datas[1].m_key >> m_startBit) & 0xff;
	datas[2].m_key = (datas[2].m_key >> m_startBit) & 0xff;
	datas[3].m_key = (datas[3].m_key >> m_startBit) & 0xff;

	int tableIdx = lIdx%16;

	AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);
	AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);
	AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);
	AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);

	GROUP_LDS_BARRIER;

	// Reduce the 16 sub-histograms; each thread sums bins lIdx and lIdx+128.
	u32 sum0, sum1;
	sum0 = sum1 = 0;
	for(int i=0; i<16; i++)
	{
		sum0 += ldsHistogram[i][lIdx];
		sum1 += ldsHistogram[i][lIdx+128];
	}

	ldsHistogramOut[lIdx*m_numGroups+GET_GROUP_IDX] = sum0;
	ldsHistogramOut[(lIdx+128)*m_numGroups+GET_GROUP_IDX] = sum1;
}
RWStructuredBuffer<SortData> sortDataOut : register( u0 );
// Scanned histogram: one prefix-summed global offset per (bin, group) pair.
RWStructuredBuffer<u32> scannedHistogram : register( u1 );
// Running output location for each of this group's 256 bins.
groupshared u32 ldsCurrentLocation[256];
// Scatters this group's 512 elements to their final global positions using the
// scanned histogram. The i==lIdx loop serializes the work-items so destination
// slots are claimed in thread order, which keeps the scatter stable.
[numthreads(WG_SIZE, 1, 1)]
void ScatterKernel( DEFAULT_ARGS )
{
	int lIdx = GET_LOCAL_IDX;
	int gIdx = GET_GLOBAL_IDX;
	{
		ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*m_numGroups+GET_GROUP_IDX];
		ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*m_numGroups+GET_GROUP_IDX];
	}
	GROUP_LDS_BARRIER;
	SortData datas[NUM_PER_WI];
	int keys[NUM_PER_WI];
	datas[0] = sortData[gIdx*NUM_PER_WI+0];
	datas[1] = sortData[gIdx*NUM_PER_WI+1];
	datas[2] = sortData[gIdx*NUM_PER_WI+2];
	datas[3] = sortData[gIdx*NUM_PER_WI+3];
	keys[0] = (datas[0].m_key >> m_startBit) & 0xff;
	keys[1] = (datas[1].m_key >> m_startBit) & 0xff;
	keys[2] = (datas[2].m_key >> m_startBit) & 0xff;
	keys[3] = (datas[3].m_key >> m_startBit) & 0xff;
	int dst[NUM_PER_WI];
	// One work-item at a time atomically claims the next slot for each of its
	// four keys; everyone else waits at the barrier.
	for(int i=0; i<WG_SIZE; i++)
//	for(int i=0; i<m_padding[0]; i++) // to reduce compile time
	{
		if( i==lIdx )
		{
			AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);
			AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);
			AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);
			AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);
		}
		GROUP_LDS_BARRIER;
	}
	sortDataOut[dst[0]] = datas[0];
	sortDataOut[dst[1]] = datas[1];
	sortDataOut[dst[2]] = datas[2];
	sortDataOut[dst[3]] = datas[3];
}

View File

@@ -0,0 +1,149 @@
// Embedded OpenCL source for the "simple" radix sort kernels (LocalCountKernel
// + serialized ScatterKernel), used when ADL loads kernels from strings
// instead of files. Keep in sync with the on-disk .cl kernel source.
static const char* radixSortSimpleKernelsCL= \
"/*\n"
"Bullet Continuous Collision Detection and Physics Library\n"
"Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org\n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Author Takahiro Harada\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"\n"
"#define WG_SIZE 128\n"
"#define NUM_PER_WI 4\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
"	u32 m_key; \n"
"	u32 m_value;\n"
"}SortData;\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
"	u32 m_startBit;\n"
"	u32 m_numGroups;\n"
"	u32 m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void LocalCountKernel(__global SortData* sortData, \n"
"			__global u32* ldsHistogramOut,\n"
"			ConstBuffer cb)\n"
"{\n"
"	__local u32 ldsHistogram[16][256];\n"
"\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	\n"
"	for(int i=0; i<16; i++)\n"
"	{\n"
"		ldsHistogram[i][lIdx] = 0.f;\n"
"		ldsHistogram[i][lIdx+128] = 0.f;\n"
"	}\n"
"	\n"
"	GROUP_LDS_BARRIER;\n"
"	\n"
"	SortData datas[NUM_PER_WI];\n"
"	datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
"	datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
"	datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
"	datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
"	datas[0].m_key = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
"	datas[1].m_key = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
"	datas[2].m_key = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
"	datas[3].m_key = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
"\n"
"	int tableIdx = lIdx%16;\n"
"	\n"
"	AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);\n"
"	AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);\n"
"	AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);\n"
"	AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"	\n"
"	u32 sum0, sum1;\n"
"	sum0 = sum1 = 0;\n"
"	for(int i=0; i<16; i++)\n"
"	{\n"
"		sum0 += ldsHistogram[i][lIdx];\n"
"		sum1 += ldsHistogram[i][lIdx+128];\n"
"	}\n"
"\n"
"	ldsHistogramOut[lIdx*cb.m_numGroups+GET_GROUP_IDX] = sum0;\n"
"	ldsHistogramOut[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX] = sum1;\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void ScatterKernel(__global SortData* sortData,\n"
"			__global SortData* sortDataOut,\n"
"			__global u32* scannedHistogram, \n"
"			ConstBuffer cb)\n"
"{\n"
"	__local u32 ldsCurrentLocation[256];\n"
"\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	\n"
"	{\n"
"		ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*cb.m_numGroups+GET_GROUP_IDX];\n"
"		ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX];\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"	\n"
"	SortData datas[NUM_PER_WI];\n"
"	int keys[NUM_PER_WI];\n"
"	datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
"	datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
"	datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
"	datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
"	keys[0] = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
"	keys[1] = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
"	keys[2] = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
"	keys[3] = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
"\n"
"	int dst[NUM_PER_WI];\n"
"	for(int i=0; i<WG_SIZE; i++)\n"
"	{\n"
"		if( i==lIdx )\n"
"		{\n"
"			AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);\n"
"			AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);\n"
"			AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);\n"
"			AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);\n"
"		}\n"
"		GROUP_LDS_BARRIER;\n"
"	}\n"
"	sortDataOut[dst[0]] = datas[0];\n"
"	sortDataOut[dst[1]] = datas[1];\n"
"	sortDataOut[dst[2]] = datas[2];\n"
"	sortDataOut[dst[3]] = datas[3];\n"
"}\n"
;

View File

@@ -0,0 +1,135 @@
// Embedded HLSL source for the DX11 "simple" radix sort kernels
// (LocalCountKernel + serialized ScatterKernel), used when ADL loads kernels
// from strings instead of files. Keep in sync with the on-disk .hlsl source.
static const char* radixSortSimpleKernelsDX11= \
"/*\n"
"		2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128\n"
"#define NUM_PER_WI 4\n"
"\n"
"#define GET_GROUP_SIZE WG_SIZE\n"
"\n"
"typedef struct\n"
"{\n"
"	u32 m_key; \n"
"	u32 m_value;\n"
"}SortData;\n"
"\n"
"cbuffer SortCB : register( b0 )\n"
"{\n"
"	u32 m_startBit;\n"
"	u32 m_numGroups;\n"
"	u32 m_padding[2];\n"
"};\n"
"	\n"
"StructuredBuffer<SortData> sortData : register( t0 );\n"
"RWStructuredBuffer<u32> ldsHistogramOut : register( u0 );\n"
"\n"
"groupshared u32 ldsHistogram[16][256];\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void LocalCountKernel( DEFAULT_ARGS )\n"
"{\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	\n"
"	for(int i=0; i<16; i++)\n"
"	{\n"
"		ldsHistogram[i][lIdx] = 0.f;\n"
"		ldsHistogram[i][lIdx+128] = 0.f;\n"
"	}\n"
"	\n"
"	GROUP_LDS_BARRIER;\n"
"	\n"
"	SortData datas[NUM_PER_WI];\n"
"	datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
"	datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
"	datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
"	datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
"	datas[0].m_key = (datas[0].m_key >> m_startBit) & 0xff;\n"
"	datas[1].m_key = (datas[1].m_key >> m_startBit) & 0xff;\n"
"	datas[2].m_key = (datas[2].m_key >> m_startBit) & 0xff;\n"
"	datas[3].m_key = (datas[3].m_key >> m_startBit) & 0xff;\n"
"\n"
"	int tableIdx = lIdx%16;\n"
"	\n"
"	AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);\n"
"	AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);\n"
"	AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);\n"
"	AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"	\n"
"	u32 sum0, sum1;\n"
"	sum0 = sum1 = 0;\n"
"	for(int i=0; i<16; i++)\n"
"	{\n"
"		sum0 += ldsHistogram[i][lIdx];\n"
"		sum1 += ldsHistogram[i][lIdx+128];\n"
"	}\n"
"\n"
"	ldsHistogramOut[lIdx*m_numGroups+GET_GROUP_IDX] = sum0;\n"
"	ldsHistogramOut[(lIdx+128)*m_numGroups+GET_GROUP_IDX] = sum1;\n"
"}\n"
"\n"
"\n"
"RWStructuredBuffer<SortData> sortDataOut : register( u0 );\n"
"RWStructuredBuffer<u32> scannedHistogram : register( u1 );\n"
"\n"
"groupshared u32 ldsCurrentLocation[256];\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void ScatterKernel( DEFAULT_ARGS )\n"
"{\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	\n"
"	{\n"
"		ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*m_numGroups+GET_GROUP_IDX];\n"
"		ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*m_numGroups+GET_GROUP_IDX];\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"	\n"
"	SortData datas[NUM_PER_WI];\n"
"	int keys[NUM_PER_WI];\n"
"	datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
"	datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
"	datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
"	datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
"\n"
"	keys[0] = (datas[0].m_key >> m_startBit) & 0xff;\n"
"	keys[1] = (datas[1].m_key >> m_startBit) & 0xff;\n"
"	keys[2] = (datas[2].m_key >> m_startBit) & 0xff;\n"
"	keys[3] = (datas[3].m_key >> m_startBit) & 0xff;\n"
"\n"
"	int dst[NUM_PER_WI];\n"
"	for(int i=0; i<WG_SIZE; i++)\n"
"//	for(int i=0; i<m_padding[0]; i++) // to reduce compile time\n"
"	{\n"
"		if( i==lIdx )\n"
"		{\n"
"			AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);\n"
"			AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);\n"
"			AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);\n"
"			AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);\n"
"		}\n"
"		GROUP_LDS_BARRIER;\n"
"	}\n"
"	sortDataOut[dst[0]] = datas[0];\n"
"	sortDataOut[dst[1]] = datas[1];\n"
"	sortDataOut[dst[2]] = datas[2];\n"
"	sortDataOut[dst[3]] = datas[3];\n"
"}\n"
;

View File

@@ -0,0 +1,177 @@
/*
2011 Takahiro Harada
*/
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Sort\\RadixSortStandardKernels"
#define KERNEL0 "LocalSortKernel"
#define KERNEL1 "ScatterKernel"
#define KERNEL2 "CopyKernel"
#include <AdlPrimitives/Sort/RadixSortStandardKernelsCL.h>
#include <AdlPrimitives/Sort/RadixSortStandardKernelsDX11.h>
// "Standard" GPU radix sort: each pass sorts BITS_PER_PASS bits of the 32-bit
// key by locally sorting every work-group's 512 elements, prefix-scanning the
// per-group digit histograms, then scattering to the globally sorted positions.
template<DeviceType type>
class RadixSortStandard : public RadixSortBase
{
	public:
		typedef Launcher::BufferInfo BufferInfo;

		enum
		{
			WG_SIZE = 128,		// work-items per work-group
			NUM_PER_WI = 4,		// elements handled by each work-item
			BITS_PER_PASS = 4,	// radix digit width per pass (16 bins)
		};

		// Per-sorter GPU state created by allocate() and freed by deallocate().
		struct Data : public RadixSort<type>::Data
		{
			Kernel* m_localSortKernel;
			Kernel* m_scatterKernel;
			Kernel* m_copyKernel;
			Buffer<u32>* m_workBuffer0;		// per-group histograms, radix-major (scan input)
			Buffer<u32>* m_workBuffer1;		// per-group histograms, group-major (scatter input)
			Buffer<u32>* m_workBuffer2;		// scanned histograms (global scatter offsets)
			Buffer<SortData>* m_workBuffer3;	// ping-pong buffer for the sorted data
			Buffer<int4>* m_constBuffer[32/BITS_PER_PASS];	// one constant buffer per pass
		};

		static
		Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_NORMAL);

		static
		void deallocate(void* data);

		static
		void execute(void* data, Buffer<SortData>& inout, int n, int sortBits);
};
// Creates the per-sorter GPU resources needed to sort up to maxSize elements:
// the three kernels, a prefix-scan helper, histogram work buffers sized one
// u32 per (bin, work-group) pair, and one small constant buffer per pass.
template<DeviceType type>
typename RadixSortStandard<type>::Data* RadixSortStandard<type>::allocate(const Device* deviceData, int maxSize, Option option)
{
	ADLASSERT( type == deviceData->m_type );
	// Each work-group consumes WG_SIZE*NUM_PER_WI (512) elements.
	u32 maxNumGroups = (maxSize+WG_SIZE*NUM_PER_WI-1)/(WG_SIZE*NUM_PER_WI);

	// Kernel source comes from the embedded strings when
	// ADL_LOAD_KERNEL_FROM_STRING is defined, otherwise from the files at PATH.
	const char* src[] =
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
		{radixSortStandardKernelsCL,radixSortStandardKernelsDX11};
//		ADLASSERT(0);
#else
		{0,0};
#endif
	Data* data = new Data;
	data->m_option = option;
	data->m_deviceData = deviceData;
	data->m_localSortKernel = deviceData->getKernel( PATH, KERNEL0, 0, src[type] );
	data->m_scatterKernel = deviceData->getKernel( PATH, KERNEL1, 0, src[type] );
	data->m_copyKernel = deviceData->getKernel( PATH, KERNEL2, 0, src[type] );
//	is this correct?
	data->m_scanData = PrefixScan<type>::allocate( deviceData, maxNumGroups*(1<<BITS_PER_PASS) );
	data->m_workBuffer0 = new Buffer<u32>( deviceData, maxNumGroups*(1<<BITS_PER_PASS) );
	data->m_workBuffer1 = new Buffer<u32>( deviceData, maxNumGroups*(1<<BITS_PER_PASS) );
	data->m_workBuffer2 = new Buffer<u32>( deviceData, maxNumGroups*(1<<BITS_PER_PASS) );
	data->m_workBuffer3 = new Buffer<SortData>( deviceData, maxSize );
	for(int i=0; i<32/BITS_PER_PASS; i++)
		data->m_constBuffer[i] = new Buffer<int4>( deviceData, 1, BufferBase::BUFFER_CONST );
	data->m_maxSize = maxSize;
	return data;
}
// Releases every GPU resource owned by the Data instance built in allocate(),
// then destroys the Data itself.
template<DeviceType type>
void RadixSortStandard<type>::deallocate(void* rawData)
{
	Data* sortData = (Data*)rawData;

	for(int pass=0; pass<32/BITS_PER_PASS; pass++)
		delete sortData->m_constBuffer[pass];

	delete sortData->m_workBuffer3;
	delete sortData->m_workBuffer2;
	delete sortData->m_workBuffer1;
	delete sortData->m_workBuffer0;

	PrefixScan<type>::deallocate( sortData->m_scanData );

	delete sortData;
}
// Sorts the first n elements of inout by the low sortBits bits of m_key,
// BITS_PER_PASS bits per pass: local sort per work-group, prefix scan of the
// per-group histograms, then a global scatter, ping-ponging between the mapped
// input buffer and m_workBuffer3.
// n must be a multiple of WG_SIZE*NUM_PER_WI (512) and at most m_maxSize.
template<DeviceType type>
void RadixSortStandard<type>::execute(void* rawData, Buffer<SortData>& inout, int n, int sortBits)
{
	Data* data = (Data*)rawData;

	ADLASSERT( n%512 == 0 );
	ADLASSERT( n <= data->m_maxSize );
	ADLASSERT( NUM_PER_WI == 4 );

	Buffer<SortData>* src = BufferUtils::map<type, true>( data->m_deviceData, &inout );
	Buffer<SortData>* dst = data->m_workBuffer3;

	const Device* deviceData = data->m_deviceData;

	int numGroups = (n+WG_SIZE*NUM_PER_WI-1)/(WG_SIZE*NUM_PER_WI);

	// x = startBit, y = numGroups, z = WG_SIZE; w is left unset --
	// presumably unused by the kernels; confirm.
	int4 constBuffer;
	int iPass = 0;
	for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS, iPass++)
	{
		constBuffer.x = startBit;
		constBuffer.y = numGroups;
		constBuffer.z = WG_SIZE;

		{	// sort each work-group's 512 elements by the current digit and
			// emit per-group histograms in both layouts
			BufferInfo bInfo[] = { BufferInfo( src ), BufferInfo( data->m_workBuffer0 ), BufferInfo( data->m_workBuffer1 ) };

			Launcher launcher( deviceData, data->m_localSortKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
			launcher.launch1D( WG_SIZE*numGroups, WG_SIZE );
		}

		// turn the radix-major histograms into global scatter offsets
		PrefixScan<type>::execute( data->m_scanData, *data->m_workBuffer0, *data->m_workBuffer2, numGroups*(1<<BITS_PER_PASS) );

		{	// scatter the locally sorted data to its global positions in dst
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( data->m_workBuffer2, true ), BufferInfo( data->m_workBuffer1, true ),
				BufferInfo( dst ) };

			Launcher launcher( deviceData, data->m_scatterKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
			launcher.launch1D( WG_SIZE*numGroups, WG_SIZE );
		}

		if(0)
		{
			BufferInfo bInfo[] = { BufferInfo( dst, true ), BufferInfo( src ) };

			Launcher launcher( deviceData, data->m_copyKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.launch1D( n, WG_SIZE );
		}
		swap2( src, dst );
	}
	// If the final swap left the result in the work buffer, copy it back.
	if( src != &inout )
	{
		BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( dst ) };

		Launcher launcher( deviceData, data->m_copyKernel );
		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
		launcher.launch1D( n, WG_SIZE );
	}
	// NOTE(review): when the copy above ran, src is the work buffer rather
	// than the mapped buffer -- confirm unmap is meant to take src here.
	BufferUtils::unmap<true>( src, &inout );
}
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2

View File

@@ -0,0 +1,345 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Author Takahiro Harada
// Backend-abstraction macros shared with the DX11 version of these kernels.
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable

typedef unsigned int u32;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )

// 128 work-items * 4 elements each = 512 elements per work-group.
#define WG_SIZE 128
#define NUM_PER_WI 4

// Key/value pair being sorted; the radix digit is taken from m_key.
typedef struct
{
	u32 m_key;
	u32 m_value;
}SortData;

// Per-pass constants: first bit of the current digit and the group count.
typedef struct
{
	u32 m_startBit;
	u32 m_numGroups;
	u32 m_padding[2];
} ConstBuffer;

// Digit width per radix pass (16 bins).
#define BITS_PER_PASS 4
// Inclusive prefix sum of the four lanes of a uint4:
// (x, y, z, w) -> (x, x+y, x+y+z, x+y+z+w).
uint4 prefixScanVector( uint4 v )
{
	v.w += v.z;
	v.y += v.x;
	v.z += v.y;
	v.w += v.y;
	return v;
}
// Exclusive prefix sum of the four lanes of *data, performed in place:
// *data becomes (0, x, x+y, x+y+z) and the total x+y+z+w is returned.
uint prefixScanVectorEx( uint4* data )
{
	uint4 original = *data;
	uint4 scanned = original;
	scanned.y += scanned.x;
	scanned.w += scanned.z;
	scanned.z += scanned.y;
	scanned.w += scanned.y;
	*data = scanned - original;
	return scanned.w;
}
// Work-group-wide exclusive prefix sum over the 512 values held as one uint4
// per work-item. Each lane of the returned vector is the sum of every value
// preceding it across the whole group; *totalSum receives the group total.
// sorterSharedMemory must provide at least 2*WG_SIZE usable u32 entries.
uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32 sorterSharedMemory[] )
{
	{	// Set data
		// Per-thread: scan the 4 lanes in registers; publish the thread total
		// in the upper half of LDS, zero-fill the lower half (scan identity).
		sorterSharedMemory[lIdx] = 0;
		sorterSharedMemory[lIdx+WG_SIZE] = prefixScanVectorEx( &pData );
	}

	GROUP_LDS_BARRIER;

	{	// Prefix sum
		// 64 threads scan the 128 per-thread totals, two entries each.
		// NOTE(review): between barriers this relies on mem_fence plus
		// lockstep execution of the active threads -- confirm on target HW.
		int idx = 2*lIdx + (WG_SIZE+1);
		if( lIdx < 64 )
		{
			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
			GROUP_MEM_FENCE;
		}
	}

	GROUP_LDS_BARRIER;

	*totalSum = sorterSharedMemory[WG_SIZE*2-1];
	// Exclusive offset for this thread: sum of all preceding threads' totals.
	uint addValue = sorterSharedMemory[lIdx+127];
	return pData + make_uint4(addValue, addValue, addValue, addValue);
}
// Accumulates a (1<<BITS_PER_PASS)-bin histogram of the low digit of the four
// values in sortedData into the first bins of histogram (LDS). Callers must
// issue a work-group barrier after this returns before reading the bins.
void generateHistogram(u32 lIdx, u32 wgIdx,
			uint4 sortedData,
			__local u32 *histogram)
{
	// The first 16 work-items clear the bins.
	if( lIdx < (1<<BITS_PER_PASS) )
	{
		histogram[lIdx] = 0;
	}
	int mask = ((1<<BITS_PER_PASS)-1);
	uint4 keys = make_uint4( (sortedData.x)&mask, (sortedData.y)&mask, (sortedData.z)&mask, (sortedData.w)&mask );
	// Barrier so the zero-fill above is visible before counting starts.
	GROUP_LDS_BARRIER;
	AtomInc( histogram[keys.x] );
	AtomInc( histogram[keys.y] );
	AtomInc( histogram[keys.z] );
	AtomInc( histogram[keys.w] );
}
//
//
//
// Sorts this work-group's 512 elements by the BITS_PER_PASS-bit digit starting
// at cb.m_startBit using one stable 1-bit split per bit. It then emits the
// group's digit histogram twice (radix-major for the global scan, group-major
// for the scatter) and writes the locally sorted data back in place.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void LocalSortKernel(__global SortData* sortDataIn,
			__global u32* ldsHistogramOut0,
			__global u32* ldsHistogramOut1,
			ConstBuffer cb)
{
	__local u32 ldsSortData[ WG_SIZE*NUM_PER_WI + 16 ];

	int nElemsPerWG = WG_SIZE*NUM_PER_WI;
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;
	// Four consecutive element slots owned by this work-item.
	uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);

	SortData sortData[NUM_PER_WI];
	{
		u32 offset = nElemsPerWG*wgIdx;
		sortData[0] = sortDataIn[offset+localAddr.x];
		sortData[1] = sortDataIn[offset+localAddr.y];
		sortData[2] = sortDataIn[offset+localAddr.z];
		sortData[3] = sortDataIn[offset+localAddr.w];
	}

	int bitIdx = cb.m_startBit;
	do
	{
		// Stable 1-bit split: elements whose bit is 0 are packed before
		// elements whose bit is 1, preserving relative order.
//		what is this?
//		if( lIdx == wgSize-1 ) ldsSortData[256] = sortData[3].m_key;
		u32 mask = (1<<bitIdx);
		uint4 cmpResult = make_uint4( sortData[0].m_key & mask, sortData[1].m_key & mask, sortData[2].m_key & mask, sortData[3].m_key & mask );
		// prefixSum counts the zero-bit elements; its scan gives their ranks.
		uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );
		u32 total;
		prefixSum = localPrefixSum128V( prefixSum, lIdx, &total, ldsSortData );
		{
			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );
			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );

			// Permute keys, then values, through LDS; barriers separate the
			// write and read phases of the shared buffer.
			GROUP_LDS_BARRIER;
			ldsSortData[dstAddr.x] = sortData[0].m_key;
			ldsSortData[dstAddr.y] = sortData[1].m_key;
			ldsSortData[dstAddr.z] = sortData[2].m_key;
			ldsSortData[dstAddr.w] = sortData[3].m_key;
			GROUP_LDS_BARRIER;
			sortData[0].m_key = ldsSortData[localAddr.x];
			sortData[1].m_key = ldsSortData[localAddr.y];
			sortData[2].m_key = ldsSortData[localAddr.z];
			sortData[3].m_key = ldsSortData[localAddr.w];
			GROUP_LDS_BARRIER;
			ldsSortData[dstAddr.x] = sortData[0].m_value;
			ldsSortData[dstAddr.y] = sortData[1].m_value;
			ldsSortData[dstAddr.z] = sortData[2].m_value;
			ldsSortData[dstAddr.w] = sortData[3].m_value;
			GROUP_LDS_BARRIER;
			sortData[0].m_value = ldsSortData[localAddr.x];
			sortData[1].m_value = ldsSortData[localAddr.y];
			sortData[2].m_value = ldsSortData[localAddr.z];
			sortData[3].m_value = ldsSortData[localAddr.w];
			GROUP_LDS_BARRIER;
		}
		bitIdx ++;
	}
	while( bitIdx <(cb.m_startBit+BITS_PER_PASS) );

	{	// generate histogram
		uint4 localKeys = make_uint4( sortData[0].m_key>>cb.m_startBit, sortData[1].m_key>>cb.m_startBit,
			sortData[2].m_key>>cb.m_startBit, sortData[3].m_key>>cb.m_startBit );

		generateHistogram( lIdx, wgIdx, localKeys, ldsSortData );

		GROUP_LDS_BARRIER;

		// First 16 threads publish the bin counts in both global layouts.
		int nBins = (1<<BITS_PER_PASS);
		if( lIdx < nBins )
		{
			u32 histValues = ldsSortData[lIdx];

			u32 globalAddresses = nBins*wgIdx + lIdx;
			u32 globalAddressesRadixMajor = cb.m_numGroups*lIdx + wgIdx;

			ldsHistogramOut0[globalAddressesRadixMajor] = histValues;
			ldsHistogramOut1[globalAddresses] = histValues;
		}
	}

	{	// write the locally sorted run back in place
		u32 offset = nElemsPerWG*wgIdx;
		uint4 dstAddr = make_uint4(offset+localAddr.x, offset+localAddr.y, offset+localAddr.z, offset+localAddr.w );
		sortDataIn[ dstAddr.x + 0 ] = sortData[0];
		sortDataIn[ dstAddr.x + 1 ] = sortData[1];
		sortDataIn[ dstAddr.x + 2 ] = sortData[2];
		sortDataIn[ dstAddr.x + 3 ] = sortData[3];
	}
}
// Scatters a work-group's locally sorted 512 elements to their final global
// positions: dst = globalBase[bin] + (localIndex - localBase[bin]), where
// localBase is the exclusive scan of this group's own digit histogram and
// globalBase comes from the scanned (radix-major) global histogram.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void ScatterKernel(__global SortData *src,
			__global u32 *histogramGlobalRadixMajor,
			__global u32 *histogramLocalGroupMajor,
			__global SortData *dst,
			ConstBuffer cb)
{
	// [0..15]: global base offsets; [16..47]: scratch + scanned local bins.
	__local u32 sorterLocalMemory[3*(1<<BITS_PER_PASS)];
	__local u32 *ldsLocalHistogram = sorterLocalMemory + (1<<BITS_PER_PASS);
	__local u32 *ldsGlobalHistogram = sorterLocalMemory;

	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 ldsOffset = (1<<BITS_PER_PASS);

	// load and prefix scan local histogram: 8 threads handle the 16 bins
	// (two each), producing exclusive local offsets and loading the global
	// base offset of each bin for this group.
	if( lIdx < ((1<<BITS_PER_PASS)/2) )
	{
		uint2 myIdx = make_uint2(lIdx, lIdx+8);

		ldsLocalHistogram[ldsOffset+myIdx.x] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.x];
		ldsLocalHistogram[ldsOffset+myIdx.y] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.y];
		ldsLocalHistogram[ldsOffset+myIdx.x-(1<<BITS_PER_PASS)] = 0;
		ldsLocalHistogram[ldsOffset+myIdx.y-(1<<BITS_PER_PASS)] = 0;

		int idx = ldsOffset+2*lIdx;
		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-1];
		GROUP_MEM_FENCE;
		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-2];
		GROUP_MEM_FENCE;
		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-4];
		GROUP_MEM_FENCE;
		ldsLocalHistogram[idx] += ldsLocalHistogram[idx-8];
		GROUP_MEM_FENCE;

		// Propagate intermediate values through
		ldsLocalHistogram[idx-1] += ldsLocalHistogram[idx-2];
		GROUP_MEM_FENCE;

		// Grab and propagate for whole WG - loading the - 1 value
		uint2 localValues;
		localValues.x = ldsLocalHistogram[ldsOffset+myIdx.x-1];
		localValues.y = ldsLocalHistogram[ldsOffset+myIdx.y-1];

		ldsLocalHistogram[myIdx.x] = localValues.x;
		ldsLocalHistogram[myIdx.y] = localValues.y;

		ldsGlobalHistogram[myIdx.x] = histogramGlobalRadixMajor[cb.m_numGroups*myIdx.x + wgIdx];
		ldsGlobalHistogram[myIdx.y] = histogramGlobalRadixMajor[cb.m_numGroups*myIdx.y + wgIdx];
	}

	GROUP_LDS_BARRIER;

	uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
	SortData sortData[4];
	{
		uint4 globalAddr = wgIdx*WG_SIZE*NUM_PER_WI + localAddr;
		sortData[0] = src[globalAddr.x];
		sortData[1] = src[globalAddr.y];
		sortData[2] = src[globalAddr.z];
		sortData[3] = src[globalAddr.w];
	}

	uint cmpValue = ((1<<BITS_PER_PASS)-1);
	uint4 radix = make_uint4( (sortData[0].m_key>>cb.m_startBit)&cmpValue, (sortData[1].m_key>>cb.m_startBit)&cmpValue,
		(sortData[2].m_key>>cb.m_startBit)&cmpValue, (sortData[3].m_key>>cb.m_startBit)&cmpValue );

	// data is already sorted. So simply subtract local prefix sum
	uint4 dstAddr;
	dstAddr.x = ldsGlobalHistogram[radix.x] + (localAddr.x - ldsLocalHistogram[radix.x]);
	dstAddr.y = ldsGlobalHistogram[radix.y] + (localAddr.y - ldsLocalHistogram[radix.y]);
	dstAddr.z = ldsGlobalHistogram[radix.z] + (localAddr.z - ldsLocalHistogram[radix.z]);
	dstAddr.w = ldsGlobalHistogram[radix.w] + (localAddr.w - ldsLocalHistogram[radix.w]);

	dst[dstAddr.x] = sortData[0];
	dst[dstAddr.y] = sortData[1];
	dst[dstAddr.z] = sortData[2];
	dst[dstAddr.w] = sortData[3];
}
// Straight element-wise copy: dst[i] = src[i] for every global work-item.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void CopyKernel(__global SortData *src, __global SortData *dst)
{
	const int gIdx = GET_GLOBAL_IDX;
	dst[ gIdx ] = src[ gIdx ];
}

View File

@@ -0,0 +1,322 @@
/*
2011 Takahiro Harada
*/
// Backend-abstraction macros shared with the OpenCL version of these kernels.
typedef uint u32;

#define GET_GROUP_IDX groupIdx.x
#define GET_LOCAL_IDX localIdx.x
#define GET_GLOBAL_IDX globalIdx.x
#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
// Expands to nothing on this backend; GROUP_LDS_BARRIER provides the only
// synchronization points.
#define GROUP_MEM_FENCE
#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
#define AtomInc(x) InterlockedAdd(x, 1)
#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
#define make_uint4 uint4
#define make_uint2 uint2
// Per-component select: returns a where condition is true, b otherwise
// (mirrors OpenCL's select(b, a, condition)).
uint4 SELECT_UINT4(uint4 b,uint4 a,uint4 condition ){ return make_uint4( ((condition).x)?a.x:b.x, ((condition).y)?a.y:b.y, ((condition).z)?a.z:b.z, ((condition).w)?a.w:b.w ); }
// takahiro end
// 128 threads per group, 4 elements per thread => 512 elements per group.
#define WG_SIZE 128
#define NUM_PER_WI 4
#define GET_GROUP_SIZE WG_SIZE
// Key/value pair being sorted; the radix digit is taken from m_key.
typedef struct
{
	u32 m_key;
	u32 m_value;
}SortData;
// Per-pass constants: first bit of the current digit and the group count.
cbuffer SortCB : register( b0 )
{
	u32 m_startBit;
	u32 m_numGroups;
	u32 m_padding[2];
};
// Digit width per radix pass (16 bins).
#define BITS_PER_PASS 4
// Inclusive prefix sum of the four lanes of a uint4:
// (x, y, z, w) -> (x, x+y, x+y+z, x+y+z+w).
uint4 prefixScanVector( uint4 v )
{
	v.w += v.z;
	v.y += v.x;
	v.z += v.y;
	v.w += v.y;
	return v;
}
// In-place exclusive prefix sum of the four lanes of data:
// data becomes (0, x, x+y, x+y+z); the total x+y+z+w is returned.
uint prefixScanVectorEx( inout uint4 data )
{
	uint4 original = data;
	uint4 scanned = original;
	scanned.y += scanned.x;
	scanned.w += scanned.z;
	scanned.z += scanned.y;
	scanned.w += scanned.y;
	data = scanned - original;
	return scanned.w;
}
RWStructuredBuffer<SortData> sortDataIn : register( u0 );
// Per-group digit histograms, written twice by LocalSortKernel:
// radix-major (for the global scan) and group-major (for the scatter).
RWStructuredBuffer<u32> ldsHistogramOut0 : register( u1 );
RWStructuredBuffer<u32> ldsHistogramOut1 : register( u2 );
// Scratch for the 1-bit split passes; also reused for the digit histogram.
groupshared u32 ldsSortData[ WG_SIZE*NUM_PER_WI + 16 ];
// Group-wide exclusive prefix sum over the 512 values held as one uint4 per
// thread. Each lane of the returned vector is the sum of every value
// preceding it across the whole group; totalSum receives the group total.
// Uses the first 2*WG_SIZE entries of ldsSortData as scratch.
uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum )
{
	{	// Set data
		// Per-thread: scan the 4 lanes in registers; publish the thread total
		// in the upper half of LDS, zero-fill the lower half (scan identity).
		ldsSortData[lIdx] = 0;
		ldsSortData[lIdx+WG_SIZE] = prefixScanVectorEx( pData );
	}

	GROUP_LDS_BARRIER;

	{	// Prefix sum
		// 64 threads scan the 128 per-thread totals, two entries each.
		// NOTE(review): GROUP_MEM_FENCE expands to nothing here, so this
		// relies on lockstep execution of the active threads -- confirm.
		int idx = 2*lIdx + (WG_SIZE+1);
		if( lIdx < 64 )
		{
			ldsSortData[idx] += ldsSortData[idx-1];
			GROUP_MEM_FENCE;
			ldsSortData[idx] += ldsSortData[idx-2];
			GROUP_MEM_FENCE;
			ldsSortData[idx] += ldsSortData[idx-4];
			GROUP_MEM_FENCE;
			ldsSortData[idx] += ldsSortData[idx-8];
			GROUP_MEM_FENCE;
			ldsSortData[idx] += ldsSortData[idx-16];
			GROUP_MEM_FENCE;
			ldsSortData[idx] += ldsSortData[idx-32];
			GROUP_MEM_FENCE;
			ldsSortData[idx] += ldsSortData[idx-64];
			GROUP_MEM_FENCE;
			ldsSortData[idx-1] += ldsSortData[idx-2];
			GROUP_MEM_FENCE;
		}
	}

	GROUP_LDS_BARRIER;

	totalSum = ldsSortData[WG_SIZE*2-1];
	// Exclusive offset for this thread: sum of all preceding threads' totals.
	uint addValue = ldsSortData[lIdx+127];
	return pData + make_uint4(addValue, addValue, addValue, addValue);
}
// Accumulates a (1<<BITS_PER_PASS)-bin histogram of the low digit of the four
// values in sortedData into the first bins of ldsSortData. Callers must issue
// a group barrier after this returns before reading the bins.
void generateHistogram(u32 lIdx, u32 wgIdx,
			uint4 sortedData)
{
	// The first 16 threads clear the bins.
	if( lIdx < (1<<BITS_PER_PASS) )
	{
		ldsSortData[lIdx] = 0;
	}
	int mask = ((1<<BITS_PER_PASS)-1);
	uint4 keys = make_uint4( (sortedData.x)&mask, (sortedData.y)&mask, (sortedData.z)&mask, (sortedData.w)&mask );
	// Barrier so the zero-fill above is visible before counting starts.
	GROUP_LDS_BARRIER;
	AtomInc( ldsSortData[keys.x] );
	AtomInc( ldsSortData[keys.y] );
	AtomInc( ldsSortData[keys.z] );
	AtomInc( ldsSortData[keys.w] );
}
// Sorts this thread group's 512 elements by the BITS_PER_PASS-bit digit
// starting at m_startBit using one stable 1-bit split per bit, then emits the
// group's digit histogram twice (radix-major for the global scan, group-major
// for the scatter) and writes the locally sorted data back in place.
[numthreads(WG_SIZE, 1, 1)]
void LocalSortKernel( DEFAULT_ARGS )
{
	int nElemsPerWG = WG_SIZE*NUM_PER_WI;
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;
	// Four consecutive element slots owned by this thread.
	uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
	SortData sortData[NUM_PER_WI];
	{
		u32 offset = nElemsPerWG*wgIdx;
		sortData[0] = sortDataIn[offset+localAddr.x];
		sortData[1] = sortDataIn[offset+localAddr.y];
		sortData[2] = sortDataIn[offset+localAddr.z];
		sortData[3] = sortDataIn[offset+localAddr.w];
	}
	int bitIdx = m_startBit;
	do
	{
		// Stable 1-bit split: elements whose bit is 0 are packed before
		// elements whose bit is 1, preserving relative order.
//		what is this?
//		if( lIdx == wgSize-1 ) ldsSortData[256] = sortData[3].m_key;
		u32 mask = (1<<bitIdx);
		uint4 cmpResult = make_uint4( sortData[0].m_key & mask, sortData[1].m_key & mask, sortData[2].m_key & mask, sortData[3].m_key & mask );
		// prefixSum counts the zero-bit elements; its scan gives their ranks.
		uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );
		u32 total;
		prefixSum = localPrefixSum128V( prefixSum, lIdx, total );
		{
			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );
			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );
			// Permute keys, then values, through LDS; barriers separate the
			// write and read phases of the shared buffer.
			GROUP_LDS_BARRIER;
			ldsSortData[dstAddr.x] = sortData[0].m_key;
			ldsSortData[dstAddr.y] = sortData[1].m_key;
			ldsSortData[dstAddr.z] = sortData[2].m_key;
			ldsSortData[dstAddr.w] = sortData[3].m_key;
			GROUP_LDS_BARRIER;
			sortData[0].m_key = ldsSortData[localAddr.x];
			sortData[1].m_key = ldsSortData[localAddr.y];
			sortData[2].m_key = ldsSortData[localAddr.z];
			sortData[3].m_key = ldsSortData[localAddr.w];
			GROUP_LDS_BARRIER;
			ldsSortData[dstAddr.x] = sortData[0].m_value;
			ldsSortData[dstAddr.y] = sortData[1].m_value;
			ldsSortData[dstAddr.z] = sortData[2].m_value;
			ldsSortData[dstAddr.w] = sortData[3].m_value;
			GROUP_LDS_BARRIER;
			sortData[0].m_value = ldsSortData[localAddr.x];
			sortData[1].m_value = ldsSortData[localAddr.y];
			sortData[2].m_value = ldsSortData[localAddr.z];
			sortData[3].m_value = ldsSortData[localAddr.w];
			GROUP_LDS_BARRIER;
		}
		bitIdx ++;
	}
	while( bitIdx <(m_startBit+BITS_PER_PASS) );
	{	// generate histogram
		uint4 localKeys = make_uint4( sortData[0].m_key>>m_startBit, sortData[1].m_key>>m_startBit,
			sortData[2].m_key>>m_startBit, sortData[3].m_key>>m_startBit );
		generateHistogram( lIdx, wgIdx, localKeys );
		GROUP_LDS_BARRIER;
		// First 16 threads publish the bin counts in both global layouts.
		int nBins = (1<<BITS_PER_PASS);
		if( lIdx < nBins )
		{
			u32 histValues = ldsSortData[lIdx];
			u32 globalAddresses = nBins*wgIdx + lIdx;
			u32 globalAddressesRadixMajor = m_numGroups*lIdx + wgIdx;
			ldsHistogramOut0[globalAddressesRadixMajor] = histValues;
			ldsHistogramOut1[globalAddresses] = histValues;
		}
	}
	{	// write the locally sorted run back in place
		u32 offset = nElemsPerWG*wgIdx;
		uint4 dstAddr = make_uint4(offset+localAddr.x, offset+localAddr.y, offset+localAddr.z, offset+localAddr.w );
		sortDataIn[ dstAddr.x + 0 ] = sortData[0];
		sortDataIn[ dstAddr.x + 1 ] = sortData[1];
		sortDataIn[ dstAddr.x + 2 ] = sortData[2];
		sortDataIn[ dstAddr.x + 3 ] = sortData[3];
	}
}
StructuredBuffer<SortData> src : register( t0 );
// Global histogram in radix-major layout (bin*m_numGroups + group),
// presumably prefix-scanned by the host-side scan pass -- confirm.
StructuredBuffer<u32> histogramGlobalRadixMajor : register( t1 );
// Raw per-group histogram in group-major layout (group*16 + bin).
StructuredBuffer<u32> histogramLocalGroupMajor : register( t2 );
RWStructuredBuffer<SortData> dst : register( u0 );
// [16..31]: scratch for scanning the local histogram; [0..15]: its exclusive sums.
groupshared u32 ldsLocalHistogram[ 2*(1<<BITS_PER_PASS) ];
groupshared u32 ldsGlobalHistogram[ (1<<BITS_PER_PASS) ];
// Final stage of one 4-bit radix pass: moves this workgroup's locally sorted
// elements to their globally sorted positions, combining the per-group
// histogram (histogramLocalGroupMajor) with the scanned radix-major global
// histogram (histogramGlobalRadixMajor).
[numthreads(WG_SIZE, 1, 1)]
void ScatterKernel( DEFAULT_ARGS )
{
u32 lIdx = GET_LOCAL_IDX;
u32 wgIdx = GET_GROUP_IDX;
// Offset of the upper (live) half of ldsLocalHistogram.
u32 ldsOffset = (1<<BITS_PER_PASS);
// load and prefix scan local histogram
// Only (1<<BITS_PER_PASS)/2 threads take part; each one owns two bins.
if( lIdx < ((1<<BITS_PER_PASS)/2) )
{
// NOTE(review): the '+8' hardcodes (1<<BITS_PER_PASS)/2, i.e. assumes BITS_PER_PASS==4.
uint2 myIdx = make_uint2(lIdx, lIdx+8);
// Load this group's histogram into the upper half; zero the lower half
// so the scan below can read out-of-range neighbours safely.
ldsLocalHistogram[ldsOffset+myIdx.x] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.x];
ldsLocalHistogram[ldsOffset+myIdx.y] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.y];
ldsLocalHistogram[ldsOffset+myIdx.x-(1<<BITS_PER_PASS)] = 0;
ldsLocalHistogram[ldsOffset+myIdx.y-(1<<BITS_PER_PASS)] = 0;
// In-place prefix sum over the bins: odd slots accumulate strides 1,2,4,8.
int idx = ldsOffset+2*lIdx;
ldsLocalHistogram[idx] += ldsLocalHistogram[idx-1];
GROUP_MEM_FENCE;
ldsLocalHistogram[idx] += ldsLocalHistogram[idx-2];
GROUP_MEM_FENCE;
ldsLocalHistogram[idx] += ldsLocalHistogram[idx-4];
GROUP_MEM_FENCE;
ldsLocalHistogram[idx] += ldsLocalHistogram[idx-8];
GROUP_MEM_FENCE;
// Propagate intermediate values through
ldsLocalHistogram[idx-1] += ldsLocalHistogram[idx-2];
GROUP_MEM_FENCE;
// Grab and propagate for whole WG - loading the - 1 value
// (i.e. convert to an exclusive prefix sum, written to the lower half)
uint2 localValues;
localValues.x = ldsLocalHistogram[ldsOffset+myIdx.x-1];
localValues.y = ldsLocalHistogram[ldsOffset+myIdx.y-1];
ldsLocalHistogram[myIdx.x] = localValues.x;
ldsLocalHistogram[myIdx.y] = localValues.y;
// Base output offset of each radix value for this group (radix-major layout).
ldsGlobalHistogram[myIdx.x] = histogramGlobalRadixMajor[m_numGroups*myIdx.x + wgIdx];
ldsGlobalHistogram[myIdx.y] = histogramGlobalRadixMajor[m_numGroups*myIdx.y + wgIdx];
}
GROUP_LDS_BARRIER;
// Each thread reads its NUM_PER_WI consecutive, locally sorted elements.
uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
SortData sortData[4];
{
uint4 globalAddr = wgIdx*WG_SIZE*NUM_PER_WI + localAddr;
sortData[0] = src[globalAddr.x];
sortData[1] = src[globalAddr.y];
sortData[2] = src[globalAddr.z];
sortData[3] = src[globalAddr.w];
}
// Extract each key's digit for the current pass.
uint cmpValue = ((1<<BITS_PER_PASS)-1);
uint4 radix = make_uint4( (sortData[0].m_key>>m_startBit)&cmpValue, (sortData[1].m_key>>m_startBit)&cmpValue,
(sortData[2].m_key>>m_startBit)&cmpValue, (sortData[3].m_key>>m_startBit)&cmpValue );;
// data is already sorted. So simply subtract local prefix sum
uint4 dstAddr;
dstAddr.x = ldsGlobalHistogram[radix.x] + (localAddr.x - ldsLocalHistogram[radix.x]);
dstAddr.y = ldsGlobalHistogram[radix.y] + (localAddr.y - ldsLocalHistogram[radix.y]);
dstAddr.z = ldsGlobalHistogram[radix.z] + (localAddr.z - ldsLocalHistogram[radix.z]);
dstAddr.w = ldsGlobalHistogram[radix.w] + (localAddr.w - ldsLocalHistogram[radix.w]);
dst[dstAddr.x] = sortData[0];
dst[dstAddr.y] = sortData[1];
dst[dstAddr.z] = sortData[2];
dst[dstAddr.w] = sortData[3];
}
[numthreads(WG_SIZE, 1, 1)]
void CopyKernel( DEFAULT_ARGS )
{
	// One element per thread: straight copy of SortData from src to dst.
	u32 gIdx = GET_GLOBAL_IDX;
	dst[ gIdx ] = src[ gIdx ];
}

View File

@@ -0,0 +1,347 @@
// OpenCL source for the standard radix sort kernels (LocalSortKernel, ScatterKernel,
// CopyKernel) embedded as a C string — presumably generated by stringify.py from the
// .cl file; regenerate rather than hand-editing this header. TODO confirm generator.
static const char* radixSortStandardKernelsCL= \
"/*\n"
"Bullet Continuous Collision Detection and Physics Library\n"
"Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Author Takahiro Harada\n"
"\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
"#define WG_SIZE 128\n"
"#define NUM_PER_WI 4\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_key; \n"
" u32 m_value;\n"
"}SortData;\n"
"\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_startBit;\n"
" u32 m_numGroups;\n"
" u32 m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"#define BITS_PER_PASS 4\n"
"\n"
"\n"
"\n"
"uint4 prefixScanVector( uint4 data )\n"
"{\n"
" data.y += data.x;\n"
" data.w += data.z;\n"
" data.z += data.y;\n"
" data.w += data.y;\n"
" return data;\n"
"}\n"
"\n"
"uint prefixScanVectorEx( uint4* data )\n"
"{\n"
" uint4 backup = data[0];\n"
" data[0].y += data[0].x;\n"
" data[0].w += data[0].z;\n"
" data[0].z += data[0].y;\n"
" data[0].w += data[0].y;\n"
" uint sum = data[0].w;\n"
" *data -= backup;\n"
" return sum;\n"
"}\n"
"\n"
"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32 sorterSharedMemory[] )\n"
"{\n"
" { // Set data\n"
" sorterSharedMemory[lIdx] = 0;\n"
" sorterSharedMemory[lIdx+WG_SIZE] = prefixScanVectorEx( &pData );\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // Prefix sum\n"
" int idx = 2*lIdx + (WG_SIZE+1);\n"
" if( lIdx < 64 )\n"
" {\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; \n"
" GROUP_MEM_FENCE;\n"
" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
" GROUP_MEM_FENCE;\n"
"\n"
" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" *totalSum = sorterSharedMemory[WG_SIZE*2-1];\n"
" uint addValue = sorterSharedMemory[lIdx+127];\n"
" return pData + make_uint4(addValue, addValue, addValue, addValue);\n"
"}\n"
"\n"
"\n"
"void generateHistogram(u32 lIdx, u32 wgIdx, \n"
" uint4 sortedData,\n"
" __local u32 *histogram)\n"
"{\n"
" if( lIdx < (1<<BITS_PER_PASS) )\n"
" {\n"
" histogram[lIdx] = 0;\n"
" }\n"
"\n"
" int mask = ((1<<BITS_PER_PASS)-1);\n"
" uint4 keys = make_uint4( (sortedData.x)&mask, (sortedData.y)&mask, (sortedData.z)&mask, (sortedData.w)&mask );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" AtomInc( histogram[keys.x] );\n"
" AtomInc( histogram[keys.y] );\n"
" AtomInc( histogram[keys.z] );\n"
" AtomInc( histogram[keys.w] );\n"
"}\n"
"\n"
"//\n"
"//\n"
"//\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void LocalSortKernel(__global SortData* sortDataIn, \n"
" __global u32* ldsHistogramOut0,\n"
" __global u32* ldsHistogramOut1,\n"
" ConstBuffer cb)\n"
"{\n"
"\n"
" __local u32 ldsSortData[ WG_SIZE*NUM_PER_WI + 16 ];\n"
"\n"
" int nElemsPerWG = WG_SIZE*NUM_PER_WI;\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
"\n"
" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
"\n"
"\n"
" SortData sortData[NUM_PER_WI];\n"
"\n"
" {\n"
" u32 offset = nElemsPerWG*wgIdx;\n"
" sortData[0] = sortDataIn[offset+localAddr.x];\n"
" sortData[1] = sortDataIn[offset+localAddr.y];\n"
" sortData[2] = sortDataIn[offset+localAddr.z];\n"
" sortData[3] = sortDataIn[offset+localAddr.w];\n"
" }\n"
"\n"
" int bitIdx = cb.m_startBit;\n"
" do\n"
" {\n"
"// what is this?\n"
"// if( lIdx == wgSize-1 ) ldsSortData[256] = sortData[3].m_key;\n"
" u32 mask = (1<<bitIdx);\n"
" uint4 cmpResult = make_uint4( sortData[0].m_key & mask, sortData[1].m_key & mask, sortData[2].m_key & mask, sortData[3].m_key & mask );\n"
" uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n"
" u32 total;\n"
" prefixSum = localPrefixSum128V( prefixSum, lIdx, &total, ldsSortData );\n"
"\n"
" {\n"
" uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n"
" dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" ldsSortData[dstAddr.x] = sortData[0].m_key;\n"
" ldsSortData[dstAddr.y] = sortData[1].m_key;\n"
" ldsSortData[dstAddr.z] = sortData[2].m_key;\n"
" ldsSortData[dstAddr.w] = sortData[3].m_key;\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" sortData[0].m_key = ldsSortData[localAddr.x];\n"
" sortData[1].m_key = ldsSortData[localAddr.y];\n"
" sortData[2].m_key = ldsSortData[localAddr.z];\n"
" sortData[3].m_key = ldsSortData[localAddr.w];\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" ldsSortData[dstAddr.x] = sortData[0].m_value;\n"
" ldsSortData[dstAddr.y] = sortData[1].m_value;\n"
" ldsSortData[dstAddr.z] = sortData[2].m_value;\n"
" ldsSortData[dstAddr.w] = sortData[3].m_value;\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" sortData[0].m_value = ldsSortData[localAddr.x];\n"
" sortData[1].m_value = ldsSortData[localAddr.y];\n"
" sortData[2].m_value = ldsSortData[localAddr.z];\n"
" sortData[3].m_value = ldsSortData[localAddr.w];\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" bitIdx ++;\n"
" }\n"
" while( bitIdx <(cb.m_startBit+BITS_PER_PASS) );\n"
"\n"
" { // generate historgram\n"
" uint4 localKeys = make_uint4( sortData[0].m_key>>cb.m_startBit, sortData[1].m_key>>cb.m_startBit, \n"
" sortData[2].m_key>>cb.m_startBit, sortData[3].m_key>>cb.m_startBit );\n"
"\n"
" generateHistogram( lIdx, wgIdx, localKeys, ldsSortData );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" int nBins = (1<<BITS_PER_PASS);\n"
" if( lIdx < nBins )\n"
" {\n"
" u32 histValues = ldsSortData[lIdx];\n"
"\n"
" u32 globalAddresses = nBins*wgIdx + lIdx;\n"
" u32 globalAddressesRadixMajor = cb.m_numGroups*lIdx + wgIdx;\n"
" \n"
" ldsHistogramOut0[globalAddressesRadixMajor] = histValues;\n"
" ldsHistogramOut1[globalAddresses] = histValues;\n"
" }\n"
" }\n"
"\n"
"\n"
" { // write\n"
" u32 offset = nElemsPerWG*wgIdx;\n"
" uint4 dstAddr = make_uint4(offset+localAddr.x, offset+localAddr.y, offset+localAddr.z, offset+localAddr.w );\n"
"\n"
" sortDataIn[ dstAddr.x + 0 ] = sortData[0];\n"
" sortDataIn[ dstAddr.x + 1 ] = sortData[1];\n"
" sortDataIn[ dstAddr.x + 2 ] = sortData[2];\n"
" sortDataIn[ dstAddr.x + 3 ] = sortData[3];\n"
" }\n"
"}\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void ScatterKernel(__global SortData *src,\n"
" __global u32 *histogramGlobalRadixMajor,\n"
" __global u32 *histogramLocalGroupMajor,\n"
" __global SortData *dst,\n"
" ConstBuffer cb)\n"
"{\n"
" __local u32 sorterLocalMemory[3*(1<<BITS_PER_PASS)];\n"
" __local u32 *ldsLocalHistogram = sorterLocalMemory + (1<<BITS_PER_PASS);\n"
" __local u32 *ldsGlobalHistogram = sorterLocalMemory;\n"
"\n"
"\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 ldsOffset = (1<<BITS_PER_PASS);\n"
"\n"
" // load and prefix scan local histogram\n"
" if( lIdx < ((1<<BITS_PER_PASS)/2) )\n"
" {\n"
" uint2 myIdx = make_uint2(lIdx, lIdx+8);\n"
"\n"
" ldsLocalHistogram[ldsOffset+myIdx.x] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.x];\n"
" ldsLocalHistogram[ldsOffset+myIdx.y] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.y];\n"
" ldsLocalHistogram[ldsOffset+myIdx.x-(1<<BITS_PER_PASS)] = 0;\n"
" ldsLocalHistogram[ldsOffset+myIdx.y-(1<<BITS_PER_PASS)] = 0;\n"
"\n"
" int idx = ldsOffset+2*lIdx;\n"
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-1];\n"
" GROUP_MEM_FENCE;\n"
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-2];\n"
" GROUP_MEM_FENCE;\n"
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-4];\n"
" GROUP_MEM_FENCE;\n"
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-8];\n"
" GROUP_MEM_FENCE;\n"
"\n"
" // Propagate intermediate values through\n"
" ldsLocalHistogram[idx-1] += ldsLocalHistogram[idx-2];\n"
" GROUP_MEM_FENCE;\n"
"\n"
" // Grab and propagate for whole WG - loading the - 1 value\n"
" uint2 localValues;\n"
" localValues.x = ldsLocalHistogram[ldsOffset+myIdx.x-1];\n"
" localValues.y = ldsLocalHistogram[ldsOffset+myIdx.y-1];\n"
"\n"
" ldsLocalHistogram[myIdx.x] = localValues.x;\n"
" ldsLocalHistogram[myIdx.y] = localValues.y;\n"
"\n"
"\n"
" ldsGlobalHistogram[myIdx.x] = histogramGlobalRadixMajor[cb.m_numGroups*myIdx.x + wgIdx];\n"
" ldsGlobalHistogram[myIdx.y] = histogramGlobalRadixMajor[cb.m_numGroups*myIdx.y + wgIdx];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
"\n"
" SortData sortData[4];\n"
" {\n"
" uint4 globalAddr = wgIdx*WG_SIZE*NUM_PER_WI + localAddr;\n"
" sortData[0] = src[globalAddr.x];\n"
" sortData[1] = src[globalAddr.y];\n"
" sortData[2] = src[globalAddr.z];\n"
" sortData[3] = src[globalAddr.w];\n"
" }\n"
"\n"
" uint cmpValue = ((1<<BITS_PER_PASS)-1);\n"
" uint4 radix = make_uint4( (sortData[0].m_key>>cb.m_startBit)&cmpValue, (sortData[1].m_key>>cb.m_startBit)&cmpValue, \n"
" (sortData[2].m_key>>cb.m_startBit)&cmpValue, (sortData[3].m_key>>cb.m_startBit)&cmpValue );;\n"
"\n"
" // data is already sorted. So simply subtract local prefix sum\n"
" uint4 dstAddr;\n"
" dstAddr.x = ldsGlobalHistogram[radix.x] + (localAddr.x - ldsLocalHistogram[radix.x]);\n"
" dstAddr.y = ldsGlobalHistogram[radix.y] + (localAddr.y - ldsLocalHistogram[radix.y]);\n"
" dstAddr.z = ldsGlobalHistogram[radix.z] + (localAddr.z - ldsLocalHistogram[radix.z]);\n"
" dstAddr.w = ldsGlobalHistogram[radix.w] + (localAddr.w - ldsLocalHistogram[radix.w]);\n"
"\n"
" dst[dstAddr.x] = sortData[0];\n"
" dst[dstAddr.y] = sortData[1];\n"
" dst[dstAddr.z] = sortData[2];\n"
" dst[dstAddr.w] = sortData[3];\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void CopyKernel(__global SortData *src, __global SortData *dst)\n"
"{\n"
" dst[ GET_GLOBAL_IDX ] = src[ GET_GLOBAL_IDX ];\n"
"}\n"
;

View File

@@ -0,0 +1,324 @@
// HLSL (DX11) source for the standard radix sort kernels (LocalSortKernel,
// ScatterKernel, CopyKernel) embedded as a C string — presumably generated by
// stringify.py from the .hlsl file; regenerate rather than hand-editing. TODO confirm.
static const char* radixSortStandardKernelsDX11= \
"/*\n"
" 2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define GROUP_MEM_FENCE\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"#define make_uint4 uint4\n"
"#define make_uint2 uint2\n"
"\n"
"uint4 SELECT_UINT4(uint4 b,uint4 a,uint4 condition ){ return make_uint4( ((condition).x)?a.x:b.x, ((condition).y)?a.y:b.y, ((condition).z)?a.z:b.z, ((condition).w)?a.w:b.w ); }\n"
"\n"
"// takahiro end\n"
"#define WG_SIZE 128\n"
"#define NUM_PER_WI 4\n"
"\n"
"#define GET_GROUP_SIZE WG_SIZE\n"
"\n"
"typedef struct\n"
"{\n"
" u32 m_key; \n"
" u32 m_value;\n"
"}SortData;\n"
"\n"
"cbuffer SortCB : register( b0 )\n"
"{\n"
" u32 m_startBit;\n"
" u32 m_numGroups;\n"
" u32 m_padding[2];\n"
"};\n"
"\n"
"#define BITS_PER_PASS 4\n"
"\n"
"\n"
"uint4 prefixScanVector( uint4 data )\n"
"{\n"
" data.y += data.x;\n"
" data.w += data.z;\n"
" data.z += data.y;\n"
" data.w += data.y;\n"
" return data;\n"
"}\n"
"\n"
"uint prefixScanVectorEx( inout uint4 data )\n"
"{\n"
" uint4 backup = data;\n"
" data.y += data.x;\n"
" data.w += data.z;\n"
" data.z += data.y;\n"
" data.w += data.y;\n"
" uint sum = data.w;\n"
" data -= backup;\n"
" return sum;\n"
"}\n"
"\n"
"\n"
"\n"
"RWStructuredBuffer<SortData> sortDataIn : register( u0 );\n"
"RWStructuredBuffer<u32> ldsHistogramOut0 : register( u1 );\n"
"RWStructuredBuffer<u32> ldsHistogramOut1 : register( u2 );\n"
"\n"
"groupshared u32 ldsSortData[ WG_SIZE*NUM_PER_WI + 16 ];\n"
"\n"
"\n"
"uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum )\n"
"{\n"
" { // Set data\n"
" ldsSortData[lIdx] = 0;\n"
" ldsSortData[lIdx+WG_SIZE] = prefixScanVectorEx( pData );\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" { // Prefix sum\n"
" int idx = 2*lIdx + (WG_SIZE+1);\n"
" if( lIdx < 64 )\n"
" {\n"
" ldsSortData[idx] += ldsSortData[idx-1];\n"
" GROUP_MEM_FENCE;\n"
" ldsSortData[idx] += ldsSortData[idx-2]; \n"
" GROUP_MEM_FENCE;\n"
" ldsSortData[idx] += ldsSortData[idx-4];\n"
" GROUP_MEM_FENCE;\n"
" ldsSortData[idx] += ldsSortData[idx-8];\n"
" GROUP_MEM_FENCE;\n"
" ldsSortData[idx] += ldsSortData[idx-16];\n"
" GROUP_MEM_FENCE;\n"
" ldsSortData[idx] += ldsSortData[idx-32]; \n"
" GROUP_MEM_FENCE;\n"
" ldsSortData[idx] += ldsSortData[idx-64];\n"
" GROUP_MEM_FENCE;\n"
"\n"
" ldsSortData[idx-1] += ldsSortData[idx-2];\n"
" GROUP_MEM_FENCE;\n"
" }\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" totalSum = ldsSortData[WG_SIZE*2-1];\n"
" uint addValue = ldsSortData[lIdx+127];\n"
" return pData + make_uint4(addValue, addValue, addValue, addValue);\n"
"}\n"
"\n"
"void generateHistogram(u32 lIdx, u32 wgIdx, \n"
" uint4 sortedData)\n"
"{\n"
" if( lIdx < (1<<BITS_PER_PASS) )\n"
" {\n"
" ldsSortData[lIdx] = 0;\n"
" }\n"
"\n"
" int mask = ((1<<BITS_PER_PASS)-1);\n"
" uint4 keys = make_uint4( (sortedData.x)&mask, (sortedData.y)&mask, (sortedData.z)&mask, (sortedData.w)&mask );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" AtomInc( ldsSortData[keys.x] );\n"
" AtomInc( ldsSortData[keys.y] );\n"
" AtomInc( ldsSortData[keys.z] );\n"
" AtomInc( ldsSortData[keys.w] );\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void LocalSortKernel( DEFAULT_ARGS )\n"
"{\n"
" int nElemsPerWG = WG_SIZE*NUM_PER_WI;\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 wgSize = GET_GROUP_SIZE;\n"
"\n"
" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
"\n"
"\n"
" SortData sortData[NUM_PER_WI];\n"
"\n"
" {\n"
" u32 offset = nElemsPerWG*wgIdx;\n"
" sortData[0] = sortDataIn[offset+localAddr.x];\n"
" sortData[1] = sortDataIn[offset+localAddr.y];\n"
" sortData[2] = sortDataIn[offset+localAddr.z];\n"
" sortData[3] = sortDataIn[offset+localAddr.w];\n"
" }\n"
"\n"
" int bitIdx = m_startBit;\n"
" do\n"
" {\n"
"// what is this?\n"
"// if( lIdx == wgSize-1 ) ldsSortData[256] = sortData[3].m_key;\n"
" u32 mask = (1<<bitIdx);\n"
" uint4 cmpResult = make_uint4( sortData[0].m_key & mask, sortData[1].m_key & mask, sortData[2].m_key & mask, sortData[3].m_key & mask );\n"
" uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n"
" u32 total;\n"
" prefixSum = localPrefixSum128V( prefixSum, lIdx, total );\n"
"\n"
" {\n"
" uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n"
" dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" ldsSortData[dstAddr.x] = sortData[0].m_key;\n"
" ldsSortData[dstAddr.y] = sortData[1].m_key;\n"
" ldsSortData[dstAddr.z] = sortData[2].m_key;\n"
" ldsSortData[dstAddr.w] = sortData[3].m_key;\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" sortData[0].m_key = ldsSortData[localAddr.x];\n"
" sortData[1].m_key = ldsSortData[localAddr.y];\n"
" sortData[2].m_key = ldsSortData[localAddr.z];\n"
" sortData[3].m_key = ldsSortData[localAddr.w];\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" ldsSortData[dstAddr.x] = sortData[0].m_value;\n"
" ldsSortData[dstAddr.y] = sortData[1].m_value;\n"
" ldsSortData[dstAddr.z] = sortData[2].m_value;\n"
" ldsSortData[dstAddr.w] = sortData[3].m_value;\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" sortData[0].m_value = ldsSortData[localAddr.x];\n"
" sortData[1].m_value = ldsSortData[localAddr.y];\n"
" sortData[2].m_value = ldsSortData[localAddr.z];\n"
" sortData[3].m_value = ldsSortData[localAddr.w];\n"
"\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" bitIdx ++;\n"
" }\n"
" while( bitIdx <(m_startBit+BITS_PER_PASS) );\n"
"\n"
" { // generate historgram\n"
" uint4 localKeys = make_uint4( sortData[0].m_key>>m_startBit, sortData[1].m_key>>m_startBit, \n"
" sortData[2].m_key>>m_startBit, sortData[3].m_key>>m_startBit );\n"
"\n"
" generateHistogram( lIdx, wgIdx, localKeys );\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" int nBins = (1<<BITS_PER_PASS);\n"
" if( lIdx < nBins )\n"
" {\n"
" u32 histValues = ldsSortData[lIdx];\n"
"\n"
" u32 globalAddresses = nBins*wgIdx + lIdx;\n"
" u32 globalAddressesRadixMajor = m_numGroups*lIdx + wgIdx;\n"
" \n"
" ldsHistogramOut0[globalAddressesRadixMajor] = histValues;\n"
" ldsHistogramOut1[globalAddresses] = histValues;\n"
" }\n"
" }\n"
"\n"
" { // write\n"
" u32 offset = nElemsPerWG*wgIdx;\n"
" uint4 dstAddr = make_uint4(offset+localAddr.x, offset+localAddr.y, offset+localAddr.z, offset+localAddr.w );\n"
"\n"
" sortDataIn[ dstAddr.x + 0 ] = sortData[0];\n"
" sortDataIn[ dstAddr.x + 1 ] = sortData[1];\n"
" sortDataIn[ dstAddr.x + 2 ] = sortData[2];\n"
" sortDataIn[ dstAddr.x + 3 ] = sortData[3];\n"
" }\n"
"}\n"
"\n"
"StructuredBuffer<SortData> src : register( t0 );\n"
"StructuredBuffer<u32> histogramGlobalRadixMajor : register( t1 );\n"
"StructuredBuffer<u32> histogramLocalGroupMajor : register( t2 );\n"
"\n"
"RWStructuredBuffer<SortData> dst : register( u0 );\n"
"\n"
"groupshared u32 ldsLocalHistogram[ 2*(1<<BITS_PER_PASS) ];\n"
"groupshared u32 ldsGlobalHistogram[ (1<<BITS_PER_PASS) ];\n"
"\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void ScatterKernel( DEFAULT_ARGS )\n"
"{\n"
" u32 lIdx = GET_LOCAL_IDX;\n"
" u32 wgIdx = GET_GROUP_IDX;\n"
" u32 ldsOffset = (1<<BITS_PER_PASS);\n"
"\n"
" // load and prefix scan local histogram\n"
" if( lIdx < ((1<<BITS_PER_PASS)/2) )\n"
" {\n"
" uint2 myIdx = make_uint2(lIdx, lIdx+8);\n"
"\n"
" ldsLocalHistogram[ldsOffset+myIdx.x] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.x];\n"
" ldsLocalHistogram[ldsOffset+myIdx.y] = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*wgIdx + myIdx.y];\n"
" ldsLocalHistogram[ldsOffset+myIdx.x-(1<<BITS_PER_PASS)] = 0;\n"
" ldsLocalHistogram[ldsOffset+myIdx.y-(1<<BITS_PER_PASS)] = 0;\n"
"\n"
" int idx = ldsOffset+2*lIdx;\n"
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-1];\n"
" GROUP_MEM_FENCE;\n"
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-2];\n"
" GROUP_MEM_FENCE;\n"
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-4];\n"
" GROUP_MEM_FENCE;\n"
" ldsLocalHistogram[idx] += ldsLocalHistogram[idx-8];\n"
" GROUP_MEM_FENCE;\n"
"\n"
" // Propagate intermediate values through\n"
" ldsLocalHistogram[idx-1] += ldsLocalHistogram[idx-2];\n"
" GROUP_MEM_FENCE;\n"
"\n"
" // Grab and propagate for whole WG - loading the - 1 value\n"
" uint2 localValues;\n"
" localValues.x = ldsLocalHistogram[ldsOffset+myIdx.x-1];\n"
" localValues.y = ldsLocalHistogram[ldsOffset+myIdx.y-1];\n"
"\n"
" ldsLocalHistogram[myIdx.x] = localValues.x;\n"
" ldsLocalHistogram[myIdx.y] = localValues.y;\n"
"\n"
"\n"
" ldsGlobalHistogram[myIdx.x] = histogramGlobalRadixMajor[m_numGroups*myIdx.x + wgIdx];\n"
" ldsGlobalHistogram[myIdx.y] = histogramGlobalRadixMajor[m_numGroups*myIdx.y + wgIdx];\n"
" }\n"
"\n"
" GROUP_LDS_BARRIER;\n"
"\n"
" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
"\n"
" SortData sortData[4];\n"
" {\n"
" uint4 globalAddr = wgIdx*WG_SIZE*NUM_PER_WI + localAddr;\n"
" sortData[0] = src[globalAddr.x];\n"
" sortData[1] = src[globalAddr.y];\n"
" sortData[2] = src[globalAddr.z];\n"
" sortData[3] = src[globalAddr.w];\n"
" }\n"
"\n"
" uint cmpValue = ((1<<BITS_PER_PASS)-1);\n"
" uint4 radix = make_uint4( (sortData[0].m_key>>m_startBit)&cmpValue, (sortData[1].m_key>>m_startBit)&cmpValue, \n"
" (sortData[2].m_key>>m_startBit)&cmpValue, (sortData[3].m_key>>m_startBit)&cmpValue );;\n"
"\n"
" // data is already sorted. So simply subtract local prefix sum\n"
" uint4 dstAddr;\n"
" dstAddr.x = ldsGlobalHistogram[radix.x] + (localAddr.x - ldsLocalHistogram[radix.x]);\n"
" dstAddr.y = ldsGlobalHistogram[radix.y] + (localAddr.y - ldsLocalHistogram[radix.y]);\n"
" dstAddr.z = ldsGlobalHistogram[radix.z] + (localAddr.z - ldsLocalHistogram[radix.z]);\n"
" dstAddr.w = ldsGlobalHistogram[radix.w] + (localAddr.w - ldsLocalHistogram[radix.w]);\n"
"\n"
" dst[dstAddr.x] = sortData[0];\n"
" dst[dstAddr.y] = sortData[1];\n"
" dst[dstAddr.z] = sortData[2];\n"
" dst[dstAddr.w] = sortData[3];\n"
"}\n"
"\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void CopyKernel( DEFAULT_ARGS )\n"
"{\n"
" dst[ GET_GLOBAL_IDX ] = src[ GET_GLOBAL_IDX ];\n"
"}\n"
;

View File

@@ -0,0 +1,31 @@
/*
2011 Takahiro Harada
*/
#pragma once
#include <AdlPrimitives/Math/Math.h>
namespace adl
{
// Key/value pair consumed by the GPU radix sort primitives; ordering is by
// m_key only.
struct SortData
{
// Default ctor leaves both members uninitialized — presumably deliberate so
// large device/host arrays are not value-initialized; confirm before relying on it.
SortData(){}
SortData( u32 key, u32 value ) : m_key(key), m_value(value) {}
// The union exposes the 32-bit key also as two 16-bit halves
// (m_key16[0] = low half on little-endian targets — TODO confirm usage).
union
{
u32 m_key;
struct { u16 m_key16[2]; };
};
u32 m_value;
// Strict weak ordering on the key; m_value does not participate.
friend bool operator <(const SortData& a, const SortData& b)
{
return a.m_key < b.m_key;
}
};
};

View File

@@ -0,0 +1,146 @@
/*
2011 Takahiro Harada
*/
#define PATH "..\\..\\AdlPrimitives\\Sort\\RadixSortAdvancedKernels"
#define KERNEL0 "StreamCountKernel"
#define KERNEL1 "SortAndScatterKernel1"
#define KERNEL2 "PrefixScanKernel"
// GPU radix sort over SortData using 4 bits per pass (8 passes for the
// required 32-bit keys). Dispatches a capped number of workgroups
// (MAX_NUM_WORKGROUPS) that each process several blocks of input.
// Kernels (StreamCount / PrefixScan / SortAndScatter) are loaded from PATH.
template<DeviceType type>
class RadixSortAdvanced : public RadixSortBase
{
public:
typedef Launcher::BufferInfo BufferInfo;
enum
{
WG_SIZE = 128,	// work items per workgroup
NUM_PER_WI = 4,	// elements handled per work item
MAX_NUM_WORKGROUPS = 60,	// upper bound on dispatched workgroups
};
// Per-instance kernels and scratch buffers, created by allocate().
struct Data : public RadixSort<type>::Data
{
Kernel* m_localCountKernel;	// KERNEL0 "StreamCountKernel"
Kernel* m_scatterKernel;	// KERNEL1 "SortAndScatterKernel1"
Kernel* m_scanKernel;	// KERNEL2 "PrefixScanKernel"
Buffer<u32>* m_workBuffer0;	// histogram scratch (MAX_NUM_WORKGROUPS*16)
Buffer<SortData>* m_workBuffer1;	// ping-pong buffer for scatter passes
Buffer<int4>* m_constBuffer[32/4];	// one constant buffer per 4-bit pass
};
// Build kernels and scratch for sorting up to maxSize elements on deviceData.
static
Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_NORMAL);
// Releases everything allocate() created (data must come from allocate()).
static
void deallocate(void* data);
// Sorts inout ascending by m_key. Requires sortBits==32 and
// n % (WG_SIZE*NUM_PER_WI) == 0; result ends up back in inout.
static
void execute(void* data, Buffer<SortData>& inout, int n, int sortBits);
};
template<DeviceType type>
typename RadixSortAdvanced<type>::Data* RadixSortAdvanced<type>::allocate(const Device* deviceData, int maxSize, Option option)
{
	ADLASSERT( type == deviceData->m_type );

	// No embedded kernel source for this sorter: always compiled from PATH on disk.
	const char* src[] = { 0, 0, 0 };

	Data* runtime = new Data;
	runtime->m_option = option;
	runtime->m_deviceData = deviceData;
	runtime->m_maxSize = maxSize;

	runtime->m_localCountKernel = deviceData->getKernel( PATH, KERNEL0, 0, src[type] );
	runtime->m_scatterKernel = deviceData->getKernel( PATH, KERNEL1, 0, src[type] );
	runtime->m_scanKernel = deviceData->getKernel( PATH, KERNEL2, 0, src[type] );

	runtime->m_workBuffer0 = new Buffer<u32>( deviceData, MAX_NUM_WORKGROUPS*16 );
	runtime->m_workBuffer1 = new Buffer<SortData>( deviceData, maxSize );

	// One small constant buffer per 4-bit pass (32/4 == 8 passes).
	const int nPasses = 32/4;
	for(int pass=0; pass<nPasses; pass++)
	{
		runtime->m_constBuffer[pass] = new Buffer<int4>( deviceData, 1, BufferBase::BUFFER_CONST );
	}

	return runtime;
}
template<DeviceType type>
void RadixSortAdvanced<type>::deallocate(void* rawData)
{
	// Release every resource created by allocate(), then the record itself.
	Data* runtime = (Data*)rawData;

	for(int pass=0; pass<32/4; pass++)
	{
		delete runtime->m_constBuffer[pass];
	}

	delete runtime->m_workBuffer1;
	delete runtime->m_workBuffer0;

	delete runtime;
}
// Sorts inout ascending by m_key with 8 passes of a 4-bit radix sort
// (count -> scan -> scatter per pass). The even number of src/dst swaps
// leaves the final result back in inout.
// Fix: removed an unused local 'int startBit = 0;' that shadowed the loop variable.
template<DeviceType type>
void RadixSortAdvanced<type>::execute(void* rawData, Buffer<SortData>& inout, int n, int sortBits)
{
	Data* data = (Data*)rawData;

	// Only full 32-bit keys, 4 elements per work item, and whole-workgroup
	// multiples of input are supported.
	ADLASSERT( sortBits == 32 );
	ADLASSERT( NUM_PER_WI == 4 );
	ADLASSERT( n%(WG_SIZE*NUM_PER_WI) == 0 );
	ADLASSERT( MAX_NUM_WORKGROUPS < 128*8/16 );

	Buffer<SortData>* src = &inout;
	Buffer<SortData>* dst = data->m_workBuffer1;

	const Device* deviceData = data->m_deviceData;

	int nBlocks = n/(NUM_PER_WI*WG_SIZE);
	// Cap the dispatch at MAX_NUM_WORKGROUPS groups; each group then walks
	// nBlocksPerGroup blocks of input.
	const int nWorkGroupsToExecute = min2((int)MAX_NUM_WORKGROUPS, nBlocks);
	int nBlocksPerGroup = (nBlocks+nWorkGroupsToExecute-1)/nWorkGroupsToExecute;
	ADLASSERT( nWorkGroupsToExecute <= MAX_NUM_WORKGROUPS );

	// (startBit, nBlocks, nWorkGroupsToExecute, nBlocksPerGroup); x is
	// refreshed each pass.
	int4 constBuffer = make_int4(0, nBlocks, nWorkGroupsToExecute, nBlocksPerGroup);

	int iPass = 0;
	for(int startBit=0; startBit<32; startBit+=4, iPass++)
	{
		constBuffer.x = startBit;

		{	// count: per-block digit histograms into m_workBuffer0
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( data->m_workBuffer0 ) };

			Launcher launcher( deviceData, data->m_localCountKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
			launcher.launch1D( WG_SIZE* nWorkGroupsToExecute, WG_SIZE );
		}

		{	// scan: histograms -> global scatter offsets (single workgroup)
			BufferInfo bInfo[] = { BufferInfo( data->m_workBuffer0 ) };

			Launcher launcher( deviceData, data->m_scanKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
			launcher.launch1D( WG_SIZE, WG_SIZE );
		}

		{	// scatter: move elements to their sorted position for this digit
			BufferInfo bInfo[] = { BufferInfo( data->m_workBuffer0, true ), BufferInfo( src ), BufferInfo( dst ) };

			Launcher launcher( deviceData, data->m_scatterKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
			launcher.launch1D( WG_SIZE*nWorkGroupsToExecute, WG_SIZE );
		}

		swap2( src, dst );
	}
}
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2

View File

@@ -0,0 +1,149 @@
/*
2011 Takahiro Harada
*/
#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Sort\\RadixSortSimpleKernels"
#define KERNEL0 "LocalCountKernel"
#define KERNEL1 "ScatterKernel"
#include <AdlPrimitives/Sort/RadixSortSimpleCL.h>
#include <AdlPrimitives/Sort/RadixSortSimpleDX11.h>
// Simple GPU radix sort over SortData: 8 bits per pass (4 passes for the
// required 32-bit keys) using a count / prefix-scan / scatter pipeline.
// Kernels are loaded from PATH or from the embedded strings above.
template<DeviceType type>
class RadixSortSimple : public RadixSortBase
{
public:
typedef Launcher::BufferInfo BufferInfo;
enum
{
WG_SIZE = 128,	// work items per workgroup
NUM_PER_WI = 4,	// elements handled per work item
};
// Per-instance kernels and scratch buffers, created by allocate().
// (m_scanData is presumably declared in the RadixSort<type>::Data base —
// it is set in allocate() but not declared here.)
struct Data : public RadixSort<type>::Data
{
Kernel* m_localCountKernel;	// KERNEL0 "LocalCountKernel"
Kernel* m_scatterKernel;	// KERNEL1 "ScatterKernel"
Buffer<u32>* m_workBuffer0;	// per-group 256-bin histograms
Buffer<u32>* m_workBuffer1;	// scanned histograms (scatter offsets)
Buffer<SortData>* m_workBuffer2;	// ping-pong buffer for scatter passes
Buffer<int4>* m_constBuffer[4];	// one constant buffer per 8-bit pass
};
// Build kernels and scratch for sorting up to maxSize elements on deviceData.
static
Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_NORMAL);
// Releases everything allocate() created (data must come from allocate()).
static
void deallocate(void* data);
// Sorts inout ascending by m_key. Requires sortBits==32; result ends up
// back in inout after the four passes.
static
void execute(void* data, Buffer<SortData>& inout, int n, int sortBits);
};
// Creates kernels, the prefix-scan helper, and all scratch buffers needed to
// sort up to maxSize elements. Improvement: the four identical constant-buffer
// allocations are collapsed into a loop (matches RadixSortAdvanced::allocate).
template<DeviceType type>
typename RadixSortSimple<type>::Data* RadixSortSimple<type>::allocate(const Device* deviceData, int maxSize, Option option)
{
	ADLASSERT( type == deviceData->m_type );
	// Kernel sources: embedded strings when ADL_LOAD_KERNEL_FROM_STRING is
	// defined, otherwise compiled from PATH on disk (indexed by device type).
	const char* src[] =
#if defined(ADL_LOAD_KERNEL_FROM_STRING)
		{radixSortSimpleKernelsCL, radixSortSimpleKernelsDX11};
#else
		{ 0, 0 };
#endif

	u32 maxNumGroups = (maxSize+WG_SIZE*NUM_PER_WI-1)/(WG_SIZE*NUM_PER_WI);

	Data* data = new Data;
	data->m_option = option;
	data->m_deviceData = deviceData;
	data->m_maxSize = maxSize;

	data->m_localCountKernel = deviceData->getKernel( PATH, KERNEL0, 0, src[type] );
	data->m_scatterKernel = deviceData->getKernel( PATH, KERNEL1, 0, src[type] );

	data->m_scanData = PrefixScan<type>::allocate( deviceData, maxSize );

	// 256 counters (one 8-bit digit histogram) per workgroup.
	data->m_workBuffer0 = new Buffer<u32>( deviceData, maxNumGroups*256 );
	data->m_workBuffer1 = new Buffer<u32>( deviceData, maxNumGroups*256 );
	data->m_workBuffer2 = new Buffer<SortData>( deviceData, maxSize );

	// One constant buffer per 8-bit pass (4 passes over 32-bit keys).
	for(int i=0; i<4; i++)
	{
		data->m_constBuffer[i] = new Buffer<int4>( deviceData, 1, BufferBase::BUFFER_CONST );
	}

	return data;
}
template<DeviceType type>
void RadixSortSimple<type>::deallocate(void* rawData)
{
	// Release every resource created by allocate(), then the record itself.
	Data* runtime = (Data*)rawData;

	PrefixScan<type>::deallocate( runtime->m_scanData );

	for(int pass=0; pass<4; pass++)
	{
		delete runtime->m_constBuffer[pass];
	}

	delete runtime->m_workBuffer2;
	delete runtime->m_workBuffer1;
	delete runtime->m_workBuffer0;

	delete runtime;
}
// Sorts inout ascending by m_key with 4 passes of an 8-bit radix sort
// (count -> prefix scan -> scatter per pass). The even number of src/dst
// swaps leaves the final result back in inout.
// Fixes: the magic 'n%512' assert now uses WG_SIZE*NUM_PER_WI (consistent with
// RadixSortAdvanced), and the unused constBuffer.w padding is zeroed so the
// constant-buffer upload is deterministic.
template<DeviceType type>
void RadixSortSimple<type>::execute(void* rawData, Buffer<SortData>& inout, int n, int sortBits)
{
	Data* data = (Data*)rawData;

	ADLASSERT( sortBits == 32 );
	// n must be a whole number of workgroup-sized chunks (128*4 == 512).
	ADLASSERT( n%(WG_SIZE*NUM_PER_WI) == 0 );
	ADLASSERT( n <= data->m_maxSize );

	Buffer<SortData>* src = &inout;
	Buffer<SortData>* dst = data->m_workBuffer2;

	const Device* deviceData = data->m_deviceData;

	int numGroups = (n+WG_SIZE*NUM_PER_WI-1)/(WG_SIZE*NUM_PER_WI);

	int4 constBuffer;
	constBuffer.w = 0;	// unused padding; zeroed for a deterministic upload
	int iPass = 0;
	for(int startBit=0; startBit<32; startBit+=8, iPass++)
	{
		constBuffer.x = startBit;
		constBuffer.y = numGroups;
		constBuffer.z = WG_SIZE;

		{	// count: per-group 256-bin histograms into m_workBuffer0
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( data->m_workBuffer0 ) };

			Launcher launcher( deviceData, data->m_localCountKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
			launcher.launch1D( WG_SIZE*numGroups, WG_SIZE );
		}

		// scan: histograms -> global scatter offsets (m_workBuffer1)
		PrefixScan<type>::execute( data->m_scanData, *data->m_workBuffer0, *data->m_workBuffer1, numGroups*256 );

		{	// scatter: move elements to their sorted position for this digit
			BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( dst ), BufferInfo( data->m_workBuffer1 ) };

			Launcher launcher( deviceData, data->m_scatterKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
			launcher.launch1D( WG_SIZE*numGroups, WG_SIZE );
		}

		swap2( src, dst );
	}
}
//Undo the file-local macros used by the RadixSortSimple implementation above.
#undef PATH
#undef KERNEL0
#undef KERNEL1

View File

@@ -0,0 +1,13 @@
#!/usr/bin/env python
# Stringify a kernel source file into a C header:
#   stringify.py <input file> <C variable name>  > SomeKernelsCL.h
# Emits `static const char* <name>= \` followed by one quoted, \n-terminated
# string literal per input line, ending with `;`.
import sys
import os
import shutil

src_path = sys.argv[1]
var_name = sys.argv[2]

# sys.stdout.write instead of print: byte-identical output on Python 2 and 3.
sys.stdout.write('static const char* ' + var_name + '= \\\n')
# 'with' guarantees the input handle is closed (the original leaked it), and
# iterating the file directly avoids reading it all into memory at once.
with open(src_path) as fh:
    for line in fh:
        text = line.strip('\n')
        sys.stdout.write('"' + text + '\\n"\n')
sys.stdout.write(';\n')

View File

@@ -0,0 +1,22 @@
rem Regenerates the embedded-kernel C headers: each OpenCL (.cl) and
rem DirectX11 (.hlsl) kernel source is converted into a C string header
rem (consumed when ADL_LOAD_KERNEL_FROM_STRING is defined) via stringify.py.
stringify.py Fill/FillKernels.cl fillKernelsCL >Fill/FillKernelsCL.h
stringify.py Fill/FillKernels.hlsl fillKernelsDX11 >Fill/FillKernelsDX11.h
stringify.py Scan/PrefixScanKernels.cl prefixScanKernelsCL >Scan/PrefixScanKernelsCL.h
stringify.py Scan/PrefixScanKernels.hlsl prefixScanKernelsDX11 >Scan/PrefixScanKernelsDX11.h
stringify.py Search/BoundSearchKernels.cl boundSearchKernelsCL >Search/BoundSearchKernelsCL.h
stringify.py Search/BoundSearchKernels.hlsl boundSearchKernelsDX11 >Search/BoundSearchKernelsDX11.h
stringify.py Sort/RadixSortSimpleKernels.cl radixSortSimpleKernelsCL >Sort/RadixSortSimpleKernelsCL.h
stringify.py Sort/RadixSortSimpleKernels.hlsl radixSortSimpleKernelsDX11 >Sort/RadixSortSimpleKernelsDX11.h
stringify.py Sort/RadixSortStandardKernels.cl radixSortStandardKernelsCL >Sort/RadixSortStandardKernelsCL.h
stringify.py Sort/RadixSort32Kernels.cl radixSort32KernelsCL >Sort/RadixSort32KernelsCL.h
stringify.py Sort/RadixSort32Kernels.hlsl radixSort32KernelsDX11 >Sort/RadixSort32KernelsDX11.h
stringify.py Copy/CopyKernels.cl copyKernelsCL >Copy/CopyKernelsCL.h
stringify.py Copy/CopyKernels.hlsl copyKernelsDX11 >Copy/CopyKernelsDX11.h
stringify.py Sort/RadixSortStandardKernels.hlsl radixSortStandardKernelsDX11 >Sort/RadixSortStandardKernelsDX11.h
stringify.py Sort/RadixSortAdvancedKernels.hlsl radixSortAdvancedKernelsDX11 >Sort/RadixSortAdvancedKernelsDX11.h

View File

@@ -0,0 +1,31 @@
-- Premake project for the primitives test built against the AMD OpenCL SDK.
-- The project is only generated when the AMD OpenCL SDK is found; DX11
-- support is added on top when the DirectX11 SDK is also present.
hasCL = findOpenCL_AMD()
hasDX11 = findDirectX11()
if (hasCL) then
project "OpenCL_DX11_primitives_test_AMD"
initOpenCL_AMD()
-- DX11 is optional for this target.
if (hasDX11) then
initDirectX11()
end
language "C++"
kind "ConsoleApp"
targetdir "../../../../bin"
includedirs {"..","../.."}
links {
"OpenCL"
}
files {
"../main.cpp",
"../RadixSortBenchmark.h",
"../UnitTests.h"
}
end

View File

@@ -0,0 +1,31 @@
-- Premake project for the primitives test built against the Intel OpenCL SDK.
-- Only generated when the Intel OpenCL SDK is found; DX11 support is added
-- when the DirectX11 SDK is also present.
hasCL = findOpenCL_Intel()
hasDX11 = findDirectX11()
if (hasCL) then
project "OpenCL_DX11_primitives_test_Intel"
initOpenCL_Intel()
-- DX11 is optional for this target.
if (hasDX11) then
initDirectX11()
end
language "C++"
kind "ConsoleApp"
targetdir "../../../../bin"
includedirs {"..","../.."}
links {
"OpenCL"
}
files {
"../main.cpp",
"../RadixSortBenchmark.h",
"../UnitTests.h"
}
end

View File

@@ -0,0 +1,103 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#include <AdlPrimitives/Copy/Copy.h>
//Measures per-launch cost of the Copy primitive on a 65K-element float4
//buffer: 10 warmup launches, then timed batches of 1,2,4,...,2048 iterations.
//Prints average ms per launch and effective bandwidth (size*16 bytes read +
//size*16 bytes written per launch).
template<DeviceType TYPE>
__inline
void copyTest( Device* device )
{
int size = 65*1024;
Buffer<float4> buf0( device, size );
Buffer<float4> buf1( device, size );
Stopwatch sw( device );
Copy<TYPE>::Data* data = Copy<TYPE>::allocate( device );
//Warm up so compilation/first-launch overhead is not measured.
for(int i=0; i<10; i++)
Copy<TYPE>::execute( data, buf1, buf0, size, CopyBase::PER_WI_1 );
DeviceUtils::waitForCompletion( device );
{
const int nTests = 12;
float t[nTests];
for(int ii=0; ii<nTests; ii++)
{
//Batch size doubles each round: 1<<ii launches per measurement.
int iter = 1<<ii;
DeviceUtils::waitForCompletion( device );
sw.start();
for(int i=0; i<iter; i++)
{
Copy<TYPE>::execute( data, buf1, buf0, size, CopyBase::PER_WI_1 );
}
DeviceUtils::waitForCompletion( device );
sw.stop();
t[ii] = sw.getMs()/(float)iter;
}
for(int ii=0; ii<nTests; ii++)
{
printf("%d: %3.4fms (%3.2fGB/s)\n", (1<<ii), t[ii], size*16*2/1024.f/1024.f/t[ii]);
}
printf("\n");
}
Copy<TYPE>::deallocate( data );
}
//Runs copyTest on a CL device (and a DX11 device when ADL_ENABLE_DX11 is set)
//to measure kernel launch overhead per backend.
//Fix: the original allocated the device(s) and never released them; they are
//now deallocated on exit, matching how runAllTest manages its devices.
void launchOverheadBenchmark()
{
	printf("LaunchOverheadBenchmark\n");

	Device* ddcl;
#if defined(ADL_ENABLE_DX11)
	Device* dddx;
#endif
	{
		DeviceUtils::Config cfg;
		ddcl = DeviceUtils::allocate( TYPE_CL, cfg );
#if defined(ADL_ENABLE_DX11)
		dddx = DeviceUtils::allocate( TYPE_DX11, cfg );
#endif
	}
	{
		printf("CL\n");
		copyTest<TYPE_CL>( ddcl );
	}
#ifdef ADL_ENABLE_DX11
	{
		printf("DX11\n");
		copyTest<TYPE_DX11>( dddx );
	}
#endif
	//Release the devices allocated above (previously leaked).
	DeviceUtils::deallocate( ddcl );
#if defined(ADL_ENABLE_DX11)
	DeviceUtils::deallocate( dddx );
#endif
}
//1, 2, 4, 8, 16, 32, 64, 128, 256,

View File

@@ -0,0 +1,31 @@
hasCL = findOpenCL_NVIDIA()
hasDX11 = findDirectX11()
if (hasCL) then
project "OpenCL_DX11_primitives_test_NVIDIA"
initOpenCL_NVIDIA()
if (hasDX11) then
initDirectX11()
end
language "C++"
kind "ConsoleApp"
targetdir "../../../../bin"
includedirs {"..","../.."}
links {
"OpenCL"
}
files {
"../main.cpp",
"../RadixSortBenchmark.h",
"../UnitTests.h"
}
end

View File

@@ -0,0 +1,121 @@
//Benchmarks the RadixSort variants over doubling input sizes (rounded up to
//multiples of 512) and prints per-size timings in ms.
//NOTE(review): data0 is allocated with SORT_STANDARD (the SORT_SIMPLE line is
//commented out), and its time t[0] is measured but not printed.
template<DeviceType TYPE>
void run( Device* device, int minSize = 512, int maxSize = 64*1024 )//, int increment = 512 )
{
ADLASSERT( TYPE == device->m_type );
Stopwatch sw( device );
// RadixSort<TYPE>::Data* data0 = RadixSort<TYPE>::allocate( device, maxSize, RadixSortBase::SORT_SIMPLE );
RadixSort<TYPE>::Data* data0 = RadixSort<TYPE>::allocate( device, maxSize, RadixSortBase::SORT_STANDARD );
RadixSort<TYPE>::Data* data1 = RadixSort<TYPE>::allocate( device, maxSize, RadixSortBase::SORT_STANDARD );
RadixSort<TYPE>::Data* data2 = RadixSort<TYPE>::allocate( device, maxSize, RadixSortBase::SORT_ADVANCED );
Buffer<SortData> buf0( device, maxSize );
Buffer<SortData> buf1( device, maxSize );
Buffer<SortData> buf2( device, maxSize );
SortData* input = new SortData[ maxSize ];
// for(int iter = minSize; iter<=maxSize; iter+=increment)
for(int iter = minSize; iter<=maxSize; iter*=2)
{
int size = NEXTMULTIPLEOF( iter, 512 );
//Same random input uploaded to all three buffers so the variants sort
//identical data.
for(int i=0; i<size; i++) input[i] = SortData( getRandom(0,0xff), i );
buf0.write( input, size );
buf1.write( input, size );
buf2.write( input, size );
DeviceUtils::waitForCompletion( device );
sw.start();
RadixSort<TYPE>::execute( data0, buf0, size );
sw.split();
RadixSort<TYPE>::execute( data1, buf1, size );
sw.split();
RadixSort<TYPE>::execute( data2, buf2, size );
sw.stop();
float t[3];
sw.getMs( t, 3 );
// printf(" %d %3.2f %3.2f %3.2f\n", size, t[0], t[1], t[2]);
printf(" %d %3.2f %3.2f\n", size, t[1], t[2]);
}
RadixSort<TYPE>::deallocate( data0 );
RadixSort<TYPE>::deallocate( data1 );
RadixSort<TYPE>::deallocate( data2 );
delete [] input;
}
//Benchmarks RadixSort32 on 'size' 32bit keys over 100 iterations and prints
//throughput in MKeys/s. Reference numbers for Cayman/Cypress are kept below.
//NOTE(review): the Copy from inputMaster into input is commented out, so
//'input' is never filled with the generated random data before sorting --
//inputMaster and copyData are effectively unused. Confirm this is intended.
template<DeviceType TYPE>
void run32( Device* device, int size )
{
//Cayman: 4194.30Keys: 373.05MKeys/s
//Cypress: 4194.30Keys: 315.13MKeys/s
ADLASSERT( TYPE == device->m_type );
Stopwatch sw( device );
RadixSort32<TYPE>::Data* data = RadixSort32<TYPE>::allocate( device, size );
Copy<TYPE>::Data* copyData = Copy<TYPE>::allocate( device );
Buffer<u32> inputMaster( device, size );
Buffer<u32> input( device, size );
Buffer<u32> output( device, size );
{
//Upload random keys; the host staging array is freed only after the
//write has completed.
u32* host = new u32[size];
for(int i=0; i<size; i++) host[i] = getRandom(0u, 0xffffffffu);
inputMaster.write( host, size );
DeviceUtils::waitForCompletion( device );
delete [] host;
}
int nIter = 100;
sw.start();
for(int iter=0; iter<nIter; iter++)
{
// Copy<TYPE>::execute( copyData, (Buffer<float>&)input, (Buffer<float>&)inputMaster, size );
// RadixSort32<TYPE>::execute( data, input, size );
RadixSort32<TYPE>::execute( data, input, output, size );
}
sw.stop();
{
float tInS = sw.getMs()/1000.f/(float)nIter;
float mKeysPerS = size/1000.f/1000.f/tInS;
printf("%3.2fMKeys: %3.2fMKeys/s\n", size/1000.f, mKeysPerS);
}
RadixSort32<TYPE>::deallocate( data );
Copy<TYPE>::deallocate( copyData );
}
template<DeviceType TYPE>
void radixSortBenchmark()
{
Device* device;
{
DeviceUtils::Config cfg;
device = DeviceUtils::allocate( TYPE, cfg );
}
run32<TYPE>( device, 256*1024*8*2 );
// run32<TYPE>( device, 256*20*6 );
// run<TYPE>( device, 512, 1024*128*4 );
DeviceUtils::deallocate( device );
}

View File

@@ -0,0 +1,801 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#include <AdlPrimitives/Scan/PrefixScan.h>
#include <AdlPrimitives/Sort/RadixSort.h>
#include <AdlPrimitives/Sort/RadixSort32.h>
#include <AdlPrimitives/Search/BoundSearch.h>
#include <AdlPrimitives/Fill/Fill.h>
#include <AdlPrimitives/Copy/Copy.h>
#include <time.h>
using namespace adl;
//Number of randomized iterations each unit test runs.
#define NUM_TESTS 10
//Global pass/fail counters updated by TEST_REPORT.
int g_nPassed = 0;
int g_nFailed = 0;
//Per-test failure flag; set by TEST_ASSERT, reset by TEST_INIT.
bool g_testFailed = 0;
//#define TEST_INIT bool g_testFailed = 0;
//Resets the failure flag at the start of a test.
#define TEST_INIT g_testFailed = 0;
//Records a failure but lets the test keep running.
#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
//#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;ADLASSERT(x);}
//Prints [O] pass / [X] fail for the test and bumps the global counters.
#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
//Round-trips random-sized u32 arrays through a device buffer and verifies
//that the data read back matches the data written.
void memCpyTest( Device* deviceData )
{
	TEST_INIT;
	const int capacity = 64*1024;
	Buffer<u32> deviceBuf( deviceData, capacity );
	u32* staging = new u32[capacity];

	for(int trial=0; trial<NUM_TESTS; trial++)
	{
		const int n = getRandom( 1024, capacity );

		//Upload an identity pattern.
		for(int i=0; i<n; i++) staging[i] = i;
		deviceBuf.write( staging, n );
		DeviceUtils::waitForCompletion( deviceData );

		//Wipe the host copy, then download and compare.
		for(int i=0; i<n; i++) staging[i] = 0;
		deviceBuf.read( staging, n );
		DeviceUtils::waitForCompletion( deviceData );

		for(int i=0; i<n; i++) TEST_ASSERT( staging[i] == i );
	}

	delete [] staging;
	TEST_REPORT( "memCpyTest" );
}
//Compiles VectorAddKernel through a KernelManager, runs it over two int
//buffers plus a float4 constant buffer, and checks buf0[i] == i+1+2
//(original value + 1 from buf1 + 2.f from constBuffer.y).
//Fix: the KernelManager was leaked; it is now deleted at the end. The Kernel
//itself is obtained via query() and assumed to be owned by the manager, so it
//is not deleted separately -- confirm against KernelManager's contract.
void kernelTest( Device* deviceData )
{
	TEST_INIT;
	KernelManager* manager = new KernelManager();
	Kernel* kernel = manager->query(deviceData, ".\\Kernel", "VectorAddKernel" );
	{
		int size = 1024;
		Buffer<int> buf0( deviceData, size );
		Buffer<int> buf1( deviceData, size );
		Buffer<float4> cBuf( deviceData, 1, BufferBase::BUFFER_CONST );
		int* hostBuf0 = new int[size];
		int* hostBuf1 = new int[size];
		for(int i=0; i<size; i++) { hostBuf0[i] = i; hostBuf1[i] = 1; }
		buf0.write( hostBuf0, size );
		buf1.write( hostBuf1, size );
		DeviceUtils::waitForCompletion( deviceData );
		float4 constBuffer;
		constBuffer.x = (float)size;
		constBuffer.y = 2.f;
		constBuffer.z = 0.f;
		constBuffer.w = 0.f;
		{
			Launcher::BufferInfo bInfo[] = { Launcher::BufferInfo( (Buffer<float>*)&buf0 ), Launcher::BufferInfo( (Buffer<float>*)&buf1, true ) };
			Launcher launcher( deviceData, kernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
			launcher.setConst( (Buffer<float4>&)cBuf, constBuffer );
			launcher.launch1D( size );
			buf0.read( hostBuf0, size );
			buf1.read( hostBuf1, size );
			DeviceUtils::waitForCompletion( deviceData );
		}
		for(int i=0; i<size; i++) { TEST_ASSERT( hostBuf0[i] == i+1+2 ); }
		delete [] hostBuf0;
		delete [] hostBuf1;
	}
	//Release the manager (previously leaked).
	delete manager;
	TEST_REPORT( "kernelTest" );
}
//Smoke test for Stopwatch: start/split/stop around two short sleeps, then
//read back the two interval times. No assertions; it only exercises the API.
//NOTE(review): Sleep() looks Windows-specific -- confirm for other platforms.
void stopwatchTest( Device* deviceData )
{
{
Stopwatch sw( deviceData );
sw.start();
Sleep(2);
sw.split();
Sleep(2);
sw.stop();
float t[2];
sw.getMs( t, 2 );
}
}
//Compares PrefixScan on the GPU backend against the host reference for
//growing input sizes. Input is all ones, and both the scanned output and the
//returned total are cross-checked.
template<DeviceType type>
void scanTest( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
ADLASSERT( type == deviceGPU->m_type );
int maxSize = 1024*256;
HostBuffer<u32> buf0( deviceHost, maxSize );
HostBuffer<u32> buf1( deviceHost, maxSize );
Buffer<u32> buf2( deviceGPU, maxSize );
Buffer<u32> buf3( deviceGPU, maxSize );
PrefixScan<type>::Data* data0 = PrefixScan<type>::allocate( deviceGPU, maxSize );
PrefixScan<TYPE_HOST>::Data* data1 = PrefixScan<TYPE_HOST>::allocate( deviceHost, maxSize );
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = min2( 128+dx*iter, maxSize );
for(int i=0; i<size; i++) buf0[i] = 1;
buf2.write( buf0.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
u32 sumHost, sumGPU;
PrefixScan<TYPE_HOST>::execute( data1, buf0, buf1, size, &sumHost );
PrefixScan<type>::execute( data0, buf2, buf3, size, &sumGPU );
//buf0 is reused as the readback target for the GPU result; the host
//result stays in buf1 for the comparison below.
buf3.read( buf0.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
TEST_ASSERT( sumHost == sumGPU );
for(int i=0; i<size; i++) TEST_ASSERT( buf1[i] == buf0[i] );
}
PrefixScan<TYPE_HOST>::deallocate( data1 );
PrefixScan<type>::deallocate( data0 );
TEST_REPORT( "scanTest" );
}
//Shared driver for the radixSort*Test wrappers: sorts random key/value pairs
//with the host reference sorter and with the device variant selected by
//SORT_TYPE, then compares the full sorted sequences. Returns the failure
//flag so the wrapper can report it.
template<DeviceType type, RadixSortBase::Option SORT_TYPE>
bool radixSortTest( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
ADLASSERT( type == deviceGPU->m_type );
int maxSize = 1024*256;
HostBuffer<SortData> buf0( deviceHost, maxSize );
HostBuffer<SortData> buf1( deviceHost, maxSize );
Buffer<SortData> buf2( deviceGPU, maxSize );
RadixSort<TYPE_HOST>::Data* dataH = RadixSort<TYPE_HOST>::allocate( deviceHost, maxSize, RadixSortBase::SORT_SIMPLE );
RadixSort<type>::Data* dataC = RadixSort<type>::allocate( deviceGPU, maxSize, SORT_TYPE );
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
//Sizes are rounded up to a multiple of 512 (the sorters assert this).
int size = min2( 128+dx*iter, maxSize-512 );
size = NEXTMULTIPLEOF( size, 512 );
for(int i=0; i<size; i++) buf0[i] = SortData( getRandom(0,0xff), i );
buf2.write( buf0.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
RadixSort<TYPE_HOST>::execute( dataH, buf0, size );
RadixSort<type>::execute( dataC, buf2, size );
buf2.read( buf1.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
for(int i=0; i<size; i++) TEST_ASSERT( buf0[i].m_value == buf1[i].m_value && buf0[i].m_key == buf1[i].m_key );
}
RadixSort<TYPE_HOST>::deallocate( dataH );
RadixSort<type>::deallocate( dataC );
return g_testFailed;
}
//Runs the shared radix sort check against the SORT_SIMPLE device variant.
template<DeviceType type>
void radixSortSimpleTest( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
//radixSortTest does its own TEST_INIT; capture its result for the report.
g_testFailed = radixSortTest<type, RadixSortBase::SORT_SIMPLE>(deviceGPU, deviceHost);
TEST_REPORT( "radixSortSimpleTest" );
}
//Runs the shared radix sort check against the SORT_STANDARD device variant.
template<DeviceType type>
void radixSortStandardTest( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
//radixSortTest does its own TEST_INIT; capture its result for the report.
g_testFailed = radixSortTest<type, RadixSortBase::SORT_STANDARD>(deviceGPU, deviceHost);
TEST_REPORT( "radixSortStandardTest" );
}
//Runs the shared radix sort check against the SORT_ADVANCED device variant.
template<DeviceType type>
void radixSortAdvancedTest( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
//radixSortTest does its own TEST_INIT; capture its result for the report.
g_testFailed = radixSortTest<type, RadixSortBase::SORT_ADVANCED>(deviceGPU, deviceHost);
TEST_REPORT( "radixSortAdvancedTest" );
}
//Verifies BoundSearch lower/upper bound results against host-sorted data:
//for every bucket i, all elements in [lowerH[i], upperH[i]) must carry key i.
//Fix: the per-iteration 'host' staging array was allocated with new[] and
//never freed (leaked NUM_TESTS times). It is now released after
//waitForCompletion, matching the write-then-wait-then-delete pattern used in
//run32, in case Buffer::write consumes the host pointer asynchronously.
template<DeviceType type>
void boundSearchTest( Device* deviceGPU, Device* deviceHost )
{
	TEST_INIT;
	ADLASSERT( type == deviceGPU->m_type );
	int maxSize = 1024*256;
	int bucketSize = 256;
	HostBuffer<SortData> buf0( deviceHost, maxSize );
	HostBuffer<u32> lowerH( deviceHost, maxSize );
	HostBuffer<u32> upperH( deviceHost, maxSize );
	Buffer<SortData> buf( deviceGPU, maxSize );
	Buffer<u32> lower( deviceGPU, maxSize );
	Buffer<u32> upper( deviceGPU, maxSize );
	BoundSearch<type>::Data* dataH = BoundSearch<type>::allocate( deviceGPU );
	RadixSort<TYPE_HOST>::Data* dataHSort = RadixSort<TYPE_HOST>::allocate( deviceHost, maxSize, RadixSortBase::SORT_SIMPLE );
	int dx = maxSize/NUM_TESTS;
	for(int iter=0; iter<NUM_TESTS; iter++)
	{
		int size = min2( 128+dx*iter, maxSize );
		//Random keys in [0, bucketSize], host-sorted as the reference input.
		for(int i=0; i<size; i++) buf0[i] = SortData( getRandom(0,bucketSize), i );
		RadixSort<TYPE_HOST>::execute( dataHSort, buf0, size );
		buf.write( buf0.m_ptr, size );
		//Seed the result buffers with an invalid marker (-1).
		u32* host = new u32[size];
		{
			for(int i=0; i<size; i++) host[i] = -1;
			lower.write( host, size );
			upper.write( host, size );
		}
		DeviceUtils::waitForCompletion( deviceGPU );
		delete [] host;	//fix: previously leaked every iteration
		BoundSearch<type>::execute( dataH, buf, size, lower, bucketSize, BoundSearchBase::BOUND_LOWER );
		BoundSearch<type>::execute( dataH, buf, size, upper, bucketSize, BoundSearchBase::BOUND_UPPER );
		lower.read( lowerH.m_ptr, bucketSize );
		upper.read( upperH.m_ptr, bucketSize );
		DeviceUtils::waitForCompletion( deviceGPU );
/*
		for(u32 i=1; i<(u32)bucketSize; i++)
		{
			for(u32 j=lowerH[i-1]; j<lowerH[i]; j++)
			{
				TEST_ASSERT( buf0[j].m_key < i );
			}
		}
		for(u32 i=0; i<(u32)bucketSize; i++)
		{
			int jMin = (i==0)?0:upperH[i-1];
			for(u32 j=jMin; j<upperH[i]; j++)
			{
				TEST_ASSERT( buf0[j].m_key <= i );
			}
		}
*/
		//Every element inside a bucket's [lower, upper) range must have that
		//bucket's key.
		for(u32 i=0; i<(u32)bucketSize; i++)
		{
			for(u32 j=lowerH[i]; j<upperH[i]; j++)
			{
				if ( buf0[j].m_key != i )
				{
					printf("error %d != %d\n",buf0[j].m_key,i);
				}
				TEST_ASSERT( buf0[j].m_key == i );
			}
		}
	}
	BoundSearch<type>::deallocate( dataH );
	RadixSort<TYPE_HOST>::deallocate( dataHSort );
	TEST_REPORT( "boundSearchTest" );
}
//Fills an int buffer with a constant (12) on both the host and the device
//backend and verifies the two results agree element-wise.
template<DeviceType type>
void fillIntTest( Device* deviceGPU, Device* deviceHost )
{
	TEST_INIT;
	ADLASSERT( type == deviceGPU->m_type );

	const int maxSize = 1024*256;
	HostBuffer<int> hostRef( deviceHost, maxSize );
	HostBuffer<int> readback( deviceHost, maxSize );
	Buffer<int> devBuf( deviceGPU, maxSize );
	Fill<TYPE_HOST>::Data* hostFill = Fill<TYPE_HOST>::allocate( deviceHost );
	Fill<type>::Data* devFill = Fill<type>::allocate( deviceGPU );

	const int step = maxSize/NUM_TESTS;
	for(int trial=0; trial<NUM_TESTS; trial++)
	{
		const int n = min2( 128+step*trial, maxSize );

		//Poison both sides with -1, then fill with 12 on each backend.
		for(int i=0; i<n; i++) hostRef[i] = -1;
		devBuf.write( hostRef.m_ptr, n );
		DeviceUtils::waitForCompletion( deviceGPU );

		Fill<TYPE_HOST>::execute( hostFill, hostRef, 12, n );
		Fill<type>::execute( devFill, devBuf, 12, n );

		devBuf.read( readback.m_ptr, n );
		DeviceUtils::waitForCompletion( deviceGPU );
		for(int i=0; i<n; i++) TEST_ASSERT( readback[i] == hostRef[i] );
	}

	Fill<TYPE_HOST>::deallocate( hostFill );
	Fill<type>::deallocate( devFill );
	TEST_REPORT( "fillIntTest" );
}
//Same as fillIntTest but fills int2 elements (value {12,12}) and compares the
//host and device backends element-wise.
template<DeviceType type>
void fillInt2Test( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
ADLASSERT( type == deviceGPU->m_type );
int maxSize = 1024*256;
HostBuffer<int2> buf0( deviceHost, maxSize );
HostBuffer<int2> buf1( deviceHost, maxSize );
Buffer<int2> buf2( deviceGPU, maxSize );
Fill<TYPE_HOST>::Data* data0 = Fill<TYPE_HOST>::allocate( deviceHost );
Fill<type>::Data* data1 = Fill<type>::allocate( deviceGPU );
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = min2( 128+dx*iter, maxSize );
//Poison with -1 before filling on both backends.
for(int i=0; i<size; i++) buf0[i] = make_int2( -1, -1 );
buf2.write( buf0.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
Fill<TYPE_HOST>::execute( data0, buf0, make_int2( 12, 12 ), size );
Fill<type>::execute( data1, buf2, make_int2( 12, 12 ), size );
buf2.read( buf1.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
for(int i=0; i<size; i++) TEST_ASSERT( buf1[i] == buf0[i] );
}
Fill<TYPE_HOST>::deallocate( data0 );
Fill<type>::deallocate( data1 );
TEST_REPORT( "fillInt2Test" );
}
//Same as fillIntTest but fills int4 elements (value {12,12,12,12}) and
//compares the host and device backends element-wise.
template<DeviceType type>
void fillInt4Test( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
ADLASSERT( type == deviceGPU->m_type );
int maxSize = 1024*256;
HostBuffer<int4> buf0( deviceHost, maxSize );
HostBuffer<int4> buf1( deviceHost, maxSize );
Buffer<int4> buf2( deviceGPU, maxSize );
Fill<TYPE_HOST>::Data* data0 = Fill<TYPE_HOST>::allocate( deviceHost );
Fill<type>::Data* data1 = Fill<type>::allocate( deviceGPU );
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
int size = min2( 128+dx*iter, maxSize );
//Poison with -1 before filling on both backends.
for(int i=0; i<size; i++) buf0[i] = make_int4( -1 );
buf2.write( buf0.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
Fill<TYPE_HOST>::execute( data0, buf0, make_int4( 12 ), size );
Fill<type>::execute( data1, buf2, make_int4( 12 ), size );
buf2.read( buf1.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
for(int i=0; i<size; i++) TEST_ASSERT( buf1[i] == buf0[i] );
}
Fill<TYPE_HOST>::deallocate( data0 );
Fill<type>::deallocate( data1 );
TEST_REPORT( "fillInt4Test" );
}
//Shared driver for the Copy*F4Test wrappers: copies random float4 data with
//the host backend and with the device backend using the given per-work-item
//OPTION, then checks the device result against both the source and the host
//copy. Returns the failure flag for the caller's TEST_REPORT.
template<DeviceType type, CopyBase::Option OPTION>
bool CopyF4Test( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
ADLASSERT( type == deviceGPU->m_type );
int maxSize = 1024*256;
HostBuffer<float4> buf0( deviceHost, maxSize );
HostBuffer<float4> buf1( deviceHost, maxSize );
Buffer<float4> buf2( deviceGPU, maxSize );
Buffer<float4> buf3( deviceGPU, maxSize );
HostBuffer<float4> devResult( deviceHost, maxSize );
Copy<TYPE_HOST>::Data* data0 = Copy<TYPE_HOST>::allocate( deviceHost );
Copy<type>::Data* data1 = Copy<type>::allocate( deviceGPU );
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
//Clamp then round up to a multiple of 4 for the copy kernels.
int size = min2( 128+dx*iter, maxSize-4 );
size = NEXTMULTIPLEOF( size, 4 );
float r = 10000.f;
for(int i=0; i<size; i++) buf0[i] = make_float4( getRandom( -r, r ), getRandom( -r, r ), getRandom( -r, r ), getRandom( -r, r ) );
buf2.write( buf0.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
Copy<TYPE_HOST>::execute( data0, buf1, buf0, size, OPTION );
Copy<type>::execute( data1, buf3, buf2, size, OPTION );
buf3.read( devResult.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
for(int i=0; i<size; i++)
{
TEST_ASSERT( buf1[i] == devResult[i] );
TEST_ASSERT( buf0[i] == devResult[i] );
}
}
Copy<TYPE_HOST>::deallocate( data0 );
Copy<type>::deallocate( data1 );
return g_testFailed;
}
//Reports the float4 copy test using the 1-element-per-work-item variant.
template<DeviceType type>
void Copy1F4Test( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
g_testFailed = CopyF4Test<type, CopyBase::PER_WI_1>( deviceGPU, deviceHost );
TEST_REPORT( "Copy1F4Test" );
}
//Reports the float4 copy test using the 2-elements-per-work-item variant.
template<DeviceType type>
void Copy2F4Test( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
g_testFailed = CopyF4Test<type, CopyBase::PER_WI_2>( deviceGPU, deviceHost );
TEST_REPORT( "Copy2F4Test" );
}
//Reports the float4 copy test using the 4-elements-per-work-item variant.
template<DeviceType type>
void Copy4F4Test( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
g_testFailed = CopyF4Test<type, CopyBase::PER_WI_4>( deviceGPU, deviceHost );
TEST_REPORT( "Copy4F4Test" );
}
//Copies random scalar float data with the host and device backends (default
//copy option) and checks the device result against both the source and the
//host copy.
template<DeviceType type>
void CopyF1Test( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
ADLASSERT( type == deviceGPU->m_type );
int maxSize = 1024*256;
HostBuffer<float> buf0( deviceHost, maxSize );
HostBuffer<float> buf1( deviceHost, maxSize );
Buffer<float> buf2( deviceGPU, maxSize );
Buffer<float> buf3( deviceGPU, maxSize );
HostBuffer<float> devResult( deviceHost, maxSize );
Copy<TYPE_HOST>::Data* data0 = Copy<TYPE_HOST>::allocate( deviceHost );
Copy<type>::Data* data1 = Copy<type>::allocate( deviceGPU );
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
//Clamp then round up to a multiple of 4 for the copy kernels.
int size = min2( 128+dx*iter, maxSize-4 );
size = NEXTMULTIPLEOF( size, 4 );
float r = 10000.f;
for(int i=0; i<size; i++) buf0[i] = getRandom( -r, r );
buf2.write( buf0.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
Copy<TYPE_HOST>::execute( data0, buf1, buf0, size );
Copy<type>::execute( data1, buf3, buf2, size );
buf3.read( devResult.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
for(int i=0; i<size; i++)
{
TEST_ASSERT( buf1[i] == devResult[i] );
TEST_ASSERT( buf0[i] == devResult[i] );
}
}
Copy<TYPE_HOST>::deallocate( data0 );
Copy<type>::deallocate( data1 );
TEST_REPORT( "CopyF1Test" );
}
//Copies random float2 data with the host and device backends (default copy
//option) and checks the device result against both the source and the host
//copy.
template<DeviceType type>
void CopyF2Test( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
ADLASSERT( type == deviceGPU->m_type );
int maxSize = 1024*256;
HostBuffer<float2> buf0( deviceHost, maxSize );
HostBuffer<float2> buf1( deviceHost, maxSize );
Buffer<float2> buf2( deviceGPU, maxSize );
Buffer<float2> buf3( deviceGPU, maxSize );
HostBuffer<float2> devResult( deviceHost, maxSize );
Copy<TYPE_HOST>::Data* data0 = Copy<TYPE_HOST>::allocate( deviceHost );
Copy<type>::Data* data1 = Copy<type>::allocate( deviceGPU );
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
//Clamp then round up to a multiple of 4 for the copy kernels.
int size = min2( 128+dx*iter, maxSize-4 );
size = NEXTMULTIPLEOF( size, 4 );
float r = 10000.f;
for(int i=0; i<size; i++) buf0[i] = make_float2( getRandom( -r, r ), getRandom( -r, r ) );
buf2.write( buf0.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
Copy<TYPE_HOST>::execute( data0, buf1, buf0, size );
Copy<type>::execute( data1, buf3, buf2, size );
buf3.read( devResult.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
for(int i=0; i<size; i++)
{
TEST_ASSERT( buf1[i] == devResult[i] );
TEST_ASSERT( buf0[i] == devResult[i] );
}
}
Copy<TYPE_HOST>::deallocate( data0 );
Copy<type>::deallocate( data1 );
TEST_REPORT( "CopyF2Test" );
}
//Compares the RadixSort32 device sort (full 32 sort bits, keys only) against
//the host reference for growing input sizes.
template<DeviceType type>
void radixSort32Test( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
ADLASSERT( type == deviceGPU->m_type );
int maxSize = 1024*256;
HostBuffer<u32> buf0( deviceHost, maxSize );
HostBuffer<u32> buf1( deviceHost, maxSize );
Buffer<u32> buf2( deviceGPU, maxSize );
RadixSort32<TYPE_HOST>::Data* dataH = RadixSort32<TYPE_HOST>::allocate( deviceHost, maxSize );
RadixSort32<type>::Data* dataC = RadixSort32<type>::allocate( deviceGPU, maxSize );
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
//Sizes are rounded up to a multiple of 512.
int size = min2( 128+dx*iter, maxSize-512 );
size = NEXTMULTIPLEOF( size, 512 );
for(int i=0; i<size; i++) buf0[i] = getRandom(0u,0xffffffffu);
buf2.write( buf0.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
RadixSort32<TYPE_HOST>::execute( dataH, buf0, size, 32 );
RadixSort32<type>::execute( dataC, buf2, size, 32 );
buf2.read( buf1.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
//Full element-wise comparison; the weaker ordering-only check is kept
//commented below for reference.
// for(int i=0; i<size-1; i++) TEST_ASSERT( buf1[i] <= buf1[i+1] );
for(int i=0; i<size; i++) TEST_ASSERT( buf0[i] == buf1[i] );
}
RadixSort32<TYPE_HOST>::deallocate( dataH );
RadixSort32<type>::deallocate( dataC );
TEST_REPORT( "RadixSort32Test" );
}
//Key/value variant of radixSort32Test: sorts 32bit keys carrying 32bit values
//on host and device and compares both sorted streams.
//NOTE(review): seedRandom((int)time(NULL)/2) is called before generating the
//keys and again before the values; with the same seed the value stream
//repeats the key stream -- presumably intentional so the comparison is
//insensitive to how equal keys are paired, but time(NULL) could tick between
//the two calls. Confirm.
template<DeviceType type>
void radixSortKeyValue32Test( Device* deviceGPU, Device* deviceHost )
{
TEST_INIT;
ADLASSERT( type == deviceGPU->m_type );
int maxSize = 1024*256;
// Host buffers
HostBuffer<u32> buf0( deviceHost, maxSize ); // Buffer for keys in host and will be sorted by host.
HostBuffer<u32> buf1( deviceHost, maxSize ); // Buffer for keys in host and will be saved by device after sorting in device.
HostBuffer<u32> buf2( deviceHost, maxSize ); // Buffer for values in host. This buffer is paired with buf0.
HostBuffer<u32> buf3( deviceHost, maxSize ); // Buffer for values in host and will be saved by device after sorting. It is paired with buf1.
// Device buffers
Buffer<u32> buf4( deviceGPU, maxSize ); // Buffer for input keys for device.
Buffer<u32> buf5( deviceGPU, maxSize ); // Buffer for output keys from device and will be sorted by device. This key data will be saved to buf1 to be compared with a result(buf0) from host.
Buffer<u32> buf6( deviceGPU, maxSize ); // Buffer for input values in device.
Buffer<u32> buf7( deviceGPU, maxSize ); // Buffer for output values in device.
RadixSort32<TYPE_HOST>::Data* dataH = RadixSort32<TYPE_HOST>::allocate( deviceHost, maxSize );
RadixSort32<type>::Data* dataC = RadixSort32<type>::allocate( deviceGPU, maxSize );
int dx = maxSize/NUM_TESTS;
for(int iter=0; iter<NUM_TESTS; iter++)
{
//Sizes are rounded up to a multiple of 512.
int size = min2( 128+dx*iter, maxSize-512 );
size = NEXTMULTIPLEOF( size, 512 );
// keys
seedRandom((int)time(NULL)/2);
for(int i=0; i<size; i++) buf0[i] = getRandom(0u,0xffffffffu);
buf4.write( buf0.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
// values
seedRandom((int)time(NULL)/2);
for(int i=0; i<size; i++) buf2[i] = getRandom(0u,0xffffffffu);
buf6.write( buf2.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
RadixSort32<TYPE_HOST>::execute( dataH, buf0, buf2, size, 32 );
RadixSort32<type>::execute( dataC, buf4, buf5, buf6, buf7, size, 32 );
buf5.read( buf1.m_ptr, size );
buf7.read( buf3.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
for(int i=0; i<size; i++)
{
// Comparing keys. One is done by Host and the other is done by Device.
TEST_ASSERT( buf0[i] == buf1[i] );
// Comparing values. One is done by Host and the other is done by Device.
TEST_ASSERT( buf2[i] == buf3[i] );
}
}
RadixSort32<TYPE_HOST>::deallocate( dataH );
RadixSort32<type>::deallocate( dataC );
TEST_REPORT( "RadixSortKeyValue32Test" );
}
//Helpers to fan a test out over the available backends. With DX11 enabled
//the GPU variants run on both the CL and DX11 devices; otherwise CL only.
//The ddcl/dddx/ddhost names are the locals declared in runAllTest.
#if defined(ADL_ENABLE_DX11)
#define RUN_GPU( func ) func(ddcl); func(dddx);
#define RUN_GPU_TEMPLATE( func ) func<TYPE_CL>( ddcl, ddhost ); func<TYPE_DX11>( dddx, ddhost );
#define RUN_CL_TEMPLATE( func ) func<TYPE_CL>( ddcl, ddhost );
#else
#define RUN_GPU( func ) func(ddcl);
#define RUN_GPU_TEMPLATE( func ) func<TYPE_CL>( ddcl, ddhost );
//NOTE(review): RUN_CL_TEMPLATE is not defined in this branch. It is unused in
//this file, but a non-DX11 build would break if it were ever referenced.
#endif
//Runs on the GPU device(s) plus the host backend.
#define RUN_ALL( func ) RUN_GPU( func ); func(ddhost);
//Allocates the CL device (and DX11 when enabled) plus the host backend, runs
//every unit test on each, and prints the aggregate pass/fail summary.
void runAllTest()
{
g_nPassed = 0;
g_nFailed = 0;
Device* ddcl;
Device* ddhost;
#if defined(ADL_ENABLE_DX11)
Device* dddx;
#endif
{
DeviceUtils::Config cfg;
//Select the OpenCL platform vendor from the compile-time platform macro;
//the Intel path also forces a CPU device.
// Choose AMD or NVidia
#ifdef CL_PLATFORM_AMD
cfg.m_vendor = adl::DeviceUtils::Config::VD_AMD;
#endif
#ifdef CL_PLATFORM_INTEL
cfg.m_vendor = adl::DeviceUtils::Config::VD_INTEL;
cfg.m_type = DeviceUtils::Config::DEVICE_CPU;
#endif
#ifdef CL_PLATFORM_NVIDIA
cfg.m_vendor = adl::DeviceUtils::Config::VD_NV;
#endif
ddcl = DeviceUtils::allocate( TYPE_CL, cfg );
ddhost = DeviceUtils::allocate( TYPE_HOST, cfg );
// cfg.m_type = DeviceUtils::Config::DEVICE_GPU;
#if defined(ADL_ENABLE_DX11)
dddx = DeviceUtils::allocate( TYPE_DX11, cfg );
#endif
}
{
char name[128];
ddcl->getDeviceName( name );
printf("CL: %s\n", name);
#ifdef ADL_ENABLE_DX11
dddx->getDeviceName( name );
printf("DX11: %s\n", name);
#endif
}
//NOTE(review): radixSort32Test runs here and again inside the block below,
//so it is executed (and counted) twice per backend.
RUN_GPU_TEMPLATE( radixSort32Test );
RUN_GPU_TEMPLATE( radixSortKeyValue32Test );
if (1)
{
RUN_GPU_TEMPLATE( CopyF1Test );
RUN_GPU_TEMPLATE( CopyF2Test );
boundSearchTest<TYPE_HOST>( ddhost, ddhost );
// fillTest<TYPE_HOST>( ddhost, ddhost );
// fillTest<TYPE_CL>( ddcl, ddhost );
RUN_GPU_TEMPLATE( boundSearchTest );
RUN_GPU_TEMPLATE( fillIntTest );
RUN_GPU_TEMPLATE( fillInt2Test );
RUN_GPU_TEMPLATE( fillInt4Test );
RUN_ALL( stopwatchTest );
RUN_ALL( memCpyTest );
// RUN_GPU( kernelTest );
RUN_GPU_TEMPLATE( scanTest );
RUN_GPU_TEMPLATE( radixSortSimpleTest );
RUN_GPU_TEMPLATE( radixSortStandardTest );
RUN_GPU_TEMPLATE( radixSort32Test );
// RUN_GPU_TEMPLATE( boundSearchTest );
RUN_GPU_TEMPLATE( Copy1F4Test );
RUN_GPU_TEMPLATE( Copy2F4Test );
RUN_GPU_TEMPLATE( Copy4F4Test );
}
DeviceUtils::deallocate( ddcl );
DeviceUtils::deallocate( ddhost );
#if defined(ADL_ENABLE_DX11)
DeviceUtils::deallocate( dddx );
#endif
printf("=========\n%d Passed\n%d Failed\n", g_nPassed, g_nFailed);
}

View File

@@ -0,0 +1,118 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada
#include <stdio.h>
#include <Adl/Adl.h>
#include <AdlPrimitives/Math/Math.h>
#include "UnitTests.h"
#include "RadixSortBenchmark.h"
#include "LaunchOverheadBenchmark.h"
#undef NUM_TESTS
// Three generic float4 slots, presumably mirroring a GPU-side constant
// buffer layout — slot meanings are not shown in this file; confirm
// against the kernels that consume it. Unused by the code visible here.
struct ConstBuffer
{
float4 m_a;
float4 m_b;
float4 m_c;
};
// Entry point for the Adl primitive tests/benchmarks. Each if(0)/if(1)
// guard hand-enables one suite; as committed only runAllTest() executes.
int main()
{
if(0)
{ // radix sort test
Device* deviceHost;
Device* deviceGPU;
{
DeviceUtils::Config cfg;
// Choose AMD or NVidia
#ifdef CL_PLATFORM_AMD
cfg.m_vendor = DeviceUtils::Config::VD_AMD;
#endif
#ifdef CL_PLATFORM_INTEL
cfg.m_vendor = DeviceUtils::Config::VD_INTEL;
#endif
#ifdef CL_PLATFORM_NVIDIA
cfg.m_vendor = adl::DeviceUtils::Config::VD_NV;
#endif
// GPU device uses the DX11 backend; the host device runs the reference path.
deviceGPU = DeviceUtils::allocate( TYPE_DX11, cfg );
deviceHost = DeviceUtils::allocate( TYPE_HOST, cfg );
}
{
int maxSize = 512*20;
int size = maxSize;
HostBuffer<SortData> buf0( deviceHost, maxSize );
HostBuffer<SortData> buf1( deviceHost, maxSize );
Buffer<SortData> buf2( deviceGPU, maxSize );
RadixSort<TYPE_HOST>::Data* dataH = RadixSort<TYPE_HOST>::allocate( deviceHost, maxSize, RadixSortBase::SORT_STANDARD );
RadixSort<TYPE_DX11>::Data* dataC = RadixSort<TYPE_DX11>::allocate( deviceGPU, maxSize, RadixSortBase::SORT_ADVANCED );
{
size = NEXTMULTIPLEOF( size, 512 );
// Random keys paired with their original index as the value.
for(int i=0; i<size; i++) buf0[i] = SortData( getRandom(0,0xfff), i );
buf2.write( buf0.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
// Sort the same data on host (buf0, in place) and GPU (buf2).
RadixSort<TYPE_HOST>::execute( dataH, buf0, size );
RadixSort<TYPE_DX11>::execute( dataC, buf2, size );
buf2.read( buf1.m_ptr, size );
DeviceUtils::waitForCompletion( deviceGPU );
// GPU result must match the host reference sort element-for-element.
for(int i=0; i<size; i++) ADLASSERT( buf0[i].m_value == buf1[i].m_value && buf0[i].m_key == buf1[i].m_key );
}
RadixSort<TYPE_HOST>::deallocate( dataH );
RadixSort<TYPE_DX11>::deallocate( dataC );
}
DeviceUtils::deallocate( deviceHost );
DeviceUtils::deallocate( deviceGPU );
}
if(0)
{
launchOverheadBenchmark();
}
if(0)
{
radixSortBenchmark<TYPE_DX11>();
}
if(0)
{
radixSortBenchmark<TYPE_CL>();
}
if(1)
{
runAllTest();
}
printf("End, press <enter>\n");
getchar();
}

View File

@@ -0,0 +1,4 @@
-- Premake: pull in the per-vendor benchmark project scripts.
include "AMD"
include "NVIDIA"
include "Intel"

View File

@@ -0,0 +1,29 @@
-- Premake project for the AMD OpenCL radix sort benchmark.
-- Only generated when the AMD OpenCL SDK is found; DX11 is optional.
hasCL = findOpenCL_AMD()
hasDX11 = findDirectX11()
if (hasCL) then
project "OpenCL_DX11_radixsort_benchmark_AMD"
initOpenCL_AMD()
-- DirectX11 support is added only when its SDK is present.
if (hasDX11) then
initDirectX11()
end
language "C++"
kind "ConsoleApp"
targetdir "../../../../bin"
includedirs {"..","../.."}
links {
"OpenCL"
}
files {
"../test_large_problem_sorting.cpp"
}
end

View File

@@ -0,0 +1,29 @@
-- Premake project for the NVIDIA OpenCL radix sort benchmark.
-- Only generated when the NVIDIA OpenCL SDK is found; DX11 is optional.
hasCL = findOpenCL_NVIDIA()
hasDX11 = findDirectX11()
if (hasCL) then
project "OpenCL_DX11_radixsort_benchmark_NVIDIA"
initOpenCL_NVIDIA()
-- DirectX11 support is added only when its SDK is present.
if (hasDX11) then
initDirectX11()
end
language "C++"
kind "ConsoleApp"
targetdir "../../../../bin"
includedirs {"..","../.."}
links {
"OpenCL"
}
files {
"../test_large_problem_sorting.cpp"
}
end

View File

@@ -0,0 +1,2 @@
-- Premake: pull in the per-vendor project scripts (no Intel variant here).
include "AMD"
include "NVIDIA"

View File

@@ -0,0 +1,705 @@
/******************************************************************************
* Copyright 2010 Duane Merrill
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*
*
* AUTHORS' REQUEST:
*
* If you use|reference|benchmark this code, please cite our Technical
* Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf):
*
* @TechReport{ Merrill:Sorting:2010,
* author = "Duane Merrill and Andrew Grimshaw",
* title = "Revisiting Sorting for GPGPU Stream Architectures",
* year = "2010",
* institution = "University of Virginia, Department of Computer Science",
* address = "Charlottesville, VA, USA",
* number = "CS2010-03"
* }
*
* For more information, see our Google Code project site:
* http://code.google.com/p/back40computing/
*
* Thanks!
******************************************************************************/
/******************************************************************************
* Simple test driver program for *large-problem* radix sorting.
*
* Useful for demonstrating how to integrate radix sorting into
* your application
******************************************************************************/
/******************************************************************************
* Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans
******************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <algorithm>
#include <string>
#define BUFFERSIZE_WORKAROUND
//#include <iostream>
#include <sstream>
/**********************
*
*/
#include "Adl/Adl.h"
#include "AdlPrimitives/Sort/RadixSort32.h"
#include "AdlPrimitives/Sort/SortData.h"
using namespace adl;
/***********************
 *
 */
bool g_verbose; // set by --v; when true, TestSort prints the sorted keys
/******************************************************************************
* Routines
******************************************************************************/
/**
 * Keys-only sorting. Uses the GPU to sort the specified vector of elements for the given
 * number of iterations, displaying runtime information.
 *
 * @param[in] num_elements
 *     Size in elements of the vector to sort
 * @param[in,out] h_keys
 *     Vector of keys to sort; receives the sorted keys on return
 * @param[in] iterations
 *     Number of times to invoke the GPU sorting primitive
 * @param[in] cfg
 *     Device configuration (vendor selection etc.)
 */
template <typename K, DeviceType type>
void TimedSort(
	unsigned int num_elements,
	K *h_keys,
	unsigned int iterations, const DeviceUtils::Config& cfg)
{
	std::string sType = "No type selected";
	if (type == TYPE_CL)
		sType = "OpenCL";
	else if (type == TYPE_DX11)
		sType = "DX11";
	// %u matches the unsigned arguments (was %d).
	printf("Keys-only, %s, %u iterations, %u elements\n", sType.c_str(), iterations, num_elements);

	int max_elements = num_elements;
#ifdef BUFFERSIZE_WORKAROUND
	// The sorter needs a minimum buffer size, so round small problems up.
	if (max_elements < 1024*256)
		max_elements = 1024*256;
#endif

	// Allocate device storage for the requested backend.
	Device* deviceData = NULL;
	if ( type == TYPE_CL )
		deviceData = new DeviceCL();
#ifdef ADL_ENABLE_DX11
	else if ( type == TYPE_DX11 )
		deviceData = new DeviceDX11();
#endif //ADL_ENABLE_DX11

	if ( deviceData == NULL )
	{
		// Backend not compiled in (e.g. TYPE_DX11 without ADL_ENABLE_DX11):
		// bail out instead of dereferencing a NULL device below.
		printf("Requested device type is not available, skipping test\n");
		return;
	}

	deviceData->initialize(cfg);

	RadixSort32<type>::Data* planData = RadixSort32<type>::allocate( deviceData, max_elements);

	{
		Buffer<unsigned int> keysInOut(deviceData,max_elements);

		// Warm-up run: allocates device memory and primes code caches so the
		// timed iterations below measure steady-state performance.
		keysInOut.write(h_keys,num_elements);
		DeviceUtils::waitForCompletion( deviceData);
		RadixSort32<type>::execute( planData,keysInOut,num_elements, 32);
		DeviceUtils::waitForCompletion( deviceData);

		// Perform the timed number of sorting iterations
		double elapsed = 0;
		float duration = 0;
		StopwatchHost watch;
		watch.init(deviceData);

		for (unsigned int i = 0; i < iterations; i++)
		{
			// Move a fresh copy of the problem into device storage
			keysInOut.write(h_keys,num_elements);
			DeviceUtils::waitForCompletion( deviceData);

			// Start GPU timing record
			watch.start();

			// Call the sorting API routine
			RadixSort32<type>::execute( planData,keysInOut,num_elements, 32);
			DeviceUtils::waitForCompletion( deviceData);
			watch.stop();
			duration = watch.getMs();

			// End GPU timing record
			elapsed += (double) duration;
		}

		// Display timing information
		double avg_runtime = elapsed / iterations;
		double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
		printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);

		// Copy sorted keys back to the host
		keysInOut.read(h_keys,num_elements);
		DeviceUtils::waitForCompletion( deviceData);
	}

	// Free allocated memory
	RadixSort32<type>::deallocate( planData);
	delete deviceData;
}
/**
 * Key-value sorting. Uses the GPU to sort the specified vector of elements for the given
 * number of iterations, displaying runtime information.
 *
 * @param[in] num_elements
 *     Size in elements of the vector to sort
 * @param[in,out] h_keys
 *     Vector of keys to sort; receives the sorted keys on return
 * @param[in,out] h_values
 *     Vector of values to sort; receives the reordered values on return
 * @param[in] iterations
 *     Number of times to invoke the GPU sorting primitive
 * @param[in] cfg
 *     Device configuration (vendor selection etc.)
 */
template <typename K, typename V, DeviceType type>
void TimedSort(
	unsigned int num_elements,
	K *h_keys,
	V *h_values,
	unsigned int iterations, const DeviceUtils::Config& cfg)
{
	std::string sType = "No type selected";
	if (type == TYPE_CL)
		sType = "OpenCL";
	else if (type == TYPE_DX11)
		sType = "DX11";
	// %u matches the unsigned arguments (was %d).
	printf("Key-values, %s, %u iterations, %u elements\n", sType.c_str(), iterations, num_elements);

	int max_elements = num_elements;
#ifdef BUFFERSIZE_WORKAROUND
	// The sorter needs a minimum buffer size, so round small problems up.
	if (max_elements < 1024*256)
		max_elements = 1024*256;
#endif

	// Allocate device storage for the requested backend.
	Device* deviceData = NULL;
	if ( type == TYPE_CL )
		deviceData = new DeviceCL();
#ifdef ADL_ENABLE_DX11
	else if ( type == TYPE_DX11 )
		deviceData = new DeviceDX11();
#endif //ADL_ENABLE_DX11

	if ( deviceData == NULL )
	{
		// Backend not compiled in (e.g. TYPE_DX11 without ADL_ENABLE_DX11):
		// bail out instead of dereferencing a NULL device below.
		printf("Requested device type is not available, skipping test\n");
		return;
	}

	deviceData->initialize(cfg);

	RadixSort32<type>::Data* planData = RadixSort32<type>::allocate( deviceData, max_elements);

	{
		// Out-of-place sort: separate input and output buffers for keys/values.
		Buffer<unsigned int> keysIn(deviceData,max_elements);
		Buffer<unsigned int> valuesIn(deviceData,max_elements);
		Buffer<unsigned int> keysOut(deviceData,max_elements);
		Buffer<unsigned int> valuesOut(deviceData,max_elements);

		keysIn.write(h_keys,num_elements);
		DeviceUtils::waitForCompletion( deviceData);
		valuesIn.write(h_values,num_elements);
		DeviceUtils::waitForCompletion( deviceData);

		// Warm-up run: allocates device memory and primes code caches so the
		// timed iterations below measure steady-state performance.
		RadixSort32<type>::execute( planData, keysIn,keysOut, valuesIn,valuesOut, num_elements, 32);
		DeviceUtils::waitForCompletion( deviceData);

		// Perform the timed number of sorting iterations
		double elapsed = 0;
		float duration = 0;
		StopwatchHost watch;
		watch.init(deviceData);

		for (unsigned int i = 0; i < iterations; i++)
		{
			// Move a fresh copy of the problem into device storage
			keysIn.write(h_keys,num_elements);
			valuesIn.write(h_values,num_elements);
			DeviceUtils::waitForCompletion( deviceData);

			// Start GPU timing record
			watch.start();

			// Call the sorting API routine
			RadixSort32<type>::execute( planData, keysIn,keysOut, valuesIn,valuesOut, num_elements, 32);
			DeviceUtils::waitForCompletion( deviceData);
			watch.stop();
			duration = watch.getMs();

			// End GPU timing record
			elapsed += (double) duration;
		}

		// Display timing information
		double avg_runtime = elapsed / iterations;
		double throughput = ((double) num_elements) / avg_runtime / 1000.0 ;
		printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput);

		// Copy out data
		keysOut.read(h_keys,num_elements);
		valuesOut.read(h_values,num_elements);
		DeviceUtils::waitForCompletion( deviceData);
	}

	// Free allocated memory
	RadixSort32<type>::deallocate( planData);
	delete deviceData;
}
/**
 * Generates random keys of up to lower_key_bits effective width.
 *
 * We always take the second-order byte from rand() because the higher-order
 * bits returned by rand() are commonly considered more uniformly distributed
 * than the lower-order bits.
 *
 * We can decrease the entropy level of keys by adopting the technique
 * of Thearling and Smith in which keys are computed from the bitwise AND of
 * multiple random samples:
 *
 * entropy_reduction | Effectively-unique bits per key
 * -----------------------------------------------------
 * -1                | 0 (all bits set: the sampling loop never runs)
 * 0                 | 32
 * 1                 | 25.95
 * 2                 | 17.41
 * 3                 | 10.78
 * 4                 | 6.42
 * ...               | ...
 *
 * @param[out] key               Receives the generated key
 * @param[in]  entropy_reduction Number of extra AND-ed samples (default 0)
 * @param[in]  lower_key_bits    Restrict keys to this many low-order bits
 */
template <typename K>
void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
{
	const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
	unsigned char key_bits[NUM_UCHARS];

	do {
		for (unsigned int j = 0; j < NUM_UCHARS; j++) {
			unsigned char quarterword = 0xff;
			for (int i = 0; i <= entropy_reduction; i++) {
				quarterword &= (rand() >> 7);
			}
			key_bits[j] = quarterword;
		}

		if (lower_key_bits < (int) (sizeof(K) * 8)) {
			unsigned long long base = 0;
			memcpy(&base, key_bits, sizeof(K));
			// Must shift a 64-bit literal: a plain (1 << n) is int arithmetic,
			// which is undefined behavior for n >= 31 (e.g. 64-bit keys with
			// lower_key_bits in [32, 63]).
			base &= (1ull << lower_key_bits) - 1;
			memcpy(key_bits, &base, sizeof(K));
		}

		memcpy(&key, key_bits, sizeof(K));

	} while (key != key); // avoids NaNs when generating random floating point numbers
}
/******************************************************************************
 * Templated routines for printing keys/values to the console
 ******************************************************************************/

// Fallback: anything without a dedicated specialization prints as signed int.
template<typename T>
void PrintValue(T val) {
	printf("%d", val);
}

template<>
void PrintValue<unsigned char>(unsigned char val) {
	printf("%u", val);
}

template<>
void PrintValue<unsigned short>(unsigned short val) {
	printf("%u", val);
}

template<>
void PrintValue<unsigned int>(unsigned int val) {
	printf("%u", val);
}

template<>
void PrintValue<long>(long val) {
	printf("%ld", val);
}

template<>
void PrintValue<unsigned long>(unsigned long val) {
	printf("%lu", val);
}

template<>
void PrintValue<long long>(long long val) {
	printf("%lld", val);
}

template<>
void PrintValue<unsigned long long>(unsigned long long val) {
	printf("%llu", val);
}

template<>
void PrintValue<float>(float val) {
	printf("%f", val);
}

template<>
void PrintValue<double>(double val) {
	printf("%f", val);
}

/**
 * Compares the equivalence of two arrays. Returns 0 (printing "CORRECT")
 * when they agree over [0, len); otherwise prints the first mismatch —
 * with a small surrounding window when verbose — and returns 1.
 */
template <typename T, typename SizeT>
int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
{
	printf("\n");

	// Scan for the first position where the two arrays disagree.
	SizeT bad = len;
	for (SizeT pos = 0; pos < len; pos++) {
		if (computed[pos] != reference[pos]) {
			bad = pos;
			break;
		}
	}

	if (bad == len) {
		printf("CORRECT\n");
		return 0;
	}

	printf("INCORRECT: [%lu]: ", (unsigned long) bad);
	PrintValue<T>(computed[bad]);
	printf(" != ");
	PrintValue<T>(reference[bad]);

	if (verbose) {
		// Show up to five elements on either side of the mismatch.
		printf("\nresult[...");
		for (size_t j = (bad >= 5) ? bad - 5 : 0; (j < bad + 5) && (j < len); j++) {
			PrintValue<T>(computed[j]);
			printf(", ");
		}
		printf("...]");
		printf("\nreference[...");
		for (size_t j = (bad >= 5) ? bad - 5 : 0; (j < bad + 5) && (j < len); j++) {
			PrintValue<T>(reference[j]);
			printf(", ");
		}
		printf("...]");
	}
	return 1;
}
/**
 * Creates an example sorting problem whose keys is a vector of the specified
 * number of K elements, values of V elements, and then dispatches the problem
 * to the GPU for the given number of iterations, displaying runtime information.
 * The GPU result is verified against a host-side std::sort of the same keys.
 *
 * @param[in] iterations
 *     Number of times to invoke the GPU sorting primitive
 * @param[in] num_elements
 *     Size in elements of the vector to sort
 * @param[in] keys_only
 *     When true, no value array is allocated or sorted
 * @param[in] cfg
 *     Device configuration (vendor selection etc.)
 */
template<typename K, typename V, DeviceType type>
void TestSort(
	unsigned int iterations,
	int num_elements,
	bool keys_only, const DeviceUtils::Config& cfg)
{
	// Allocate the sorting problem on the host and fill the keys with random bytes
	K *h_keys = (K*) malloc(num_elements * sizeof(K));
	K *h_reference_keys = (K*) malloc(num_elements * sizeof(K));
	V *h_values = keys_only ? NULL : (V*) malloc(num_elements * sizeof(V));

	if (!h_keys || !h_reference_keys || (!keys_only && !h_values))
	{
		// Allocation failure: release whatever succeeded and bail out.
		printf("TestSort: host allocation failed\n");
		if (h_keys) free(h_keys);
		if (h_reference_keys) free(h_reference_keys);
		if (h_values) free(h_values);
		return;
	}

	// Use random bits; values duplicate the keys so reordering is checkable.
	for (int i = 0; i < num_elements; ++i) {
		RandomBits<K>(h_keys[i], 0);
		if (!keys_only)
			h_values[i] = h_keys[i];
		h_reference_keys[i] = h_keys[i];
	}

	// Run the timing test
	if (keys_only) {
		TimedSort<K, type>(num_elements, h_keys, iterations, cfg);
	} else {
		TimedSort<K, V, type>(num_elements, h_keys, h_values, iterations, cfg);
	}

	// Display sorted key data
	if (g_verbose) {
		printf("\n\nKeys:\n");
		for (int i = 0; i < num_elements; i++) {
			PrintValue<K>(h_keys[i]);
			printf(", ");
		}
		printf("\n\n");
	}

	// Verify solution against a host-side reference sort of the same input
	std::sort(h_reference_keys, h_reference_keys + num_elements);
	CompareResults<K>(h_keys, h_reference_keys, num_elements, true);
	printf("\n");
	fflush(stdout);

	// Free our allocated host memory (h_reference_keys was previously leaked).
	free(h_keys);
	free(h_values);           // free(NULL) is a no-op in the keys-only case
	free(h_reference_keys);
}
/**
 * Displays the commandline usage for this tool on stdout.
 */
void Usage()
{
	// Help text, one entry per output chunk; emitted verbatim in order.
	static const char* const kUsageText[] = {
		"\ntest_large_problem_sorting [--device=<device index>] [--v] [--i=<num-iterations>] [--n=<num-elements>] [--keys-only]\n",
		"\n",
		"\t--v\tDisplays sorted results to the console.\n",
		"\n",
		"\t--i\tPerforms the sorting operation <num-iterations> times\n",
		"\t\t\ton the device. Re-copies original input each time. Default = 1\n",
		"\n",
		"\t--n\tThe number of elements to comprise the sample problem\n",
		"\t\t\tDefault = 512\n",
		"\n",
		"\t--keys-only\tSpecifies that keys are not accommodated by value pairings\n",
		"\n",
	};
	for (size_t i = 0; i < sizeof(kUsageText) / sizeof(kUsageText[0]); ++i)
		fputs(kUsageText[i], stdout);
}
/******************************************************************************
* Command-line parsing
******************************************************************************/
#include <map>
#include <algorithm>
#include <string>
/**
 * Minimal "--key=value" command-line parser. Tokens not starting with "--"
 * are ignored; a bare "--flag" is stored with an empty value.
 */
class CommandLineArgs
{
protected:

	std::map<std::string, std::string> pairs;

public:

	// Constructor: records every "--key[=value]" argument into the map.
	CommandLineArgs(int argc, char **argv)
	{
		for (int argIdx = 1; argIdx < argc; argIdx++)
		{
			std::string token = argv[argIdx];

			// Skip anything that is not a "--" option.
			if (token.size() < 2 || token[0] != '-' || token[1] != '-')
				continue;

			std::string key, val;
			std::string::size_type eq = token.find('=');
			if (eq == std::string::npos) {
				key = token.substr(2);        // bare flag, empty value
			} else {
				key = token.substr(2, eq - 2);
				val = token.substr(eq + 1);
			}
			pairs[key] = val;
		}
	}

	// True when --arg_name appeared on the command line.
	bool CheckCmdLineFlag(const char* arg_name)
	{
		return pairs.find(arg_name) != pairs.end();
	}

	// Parses the value stored for --arg_name into val (defined out of line).
	template <typename T>
	void GetCmdLineArgument(const char *arg_name, T &val);

	// Number of "--" arguments recognized.
	int ParsedArgc()
	{
		return pairs.size();
	}
};
/**
 * Generic extraction of a --key=value argument: streams the stored string
 * into val. val is left unchanged when the key is absent.
 */
template <typename T>
void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
{
	std::map<std::string, std::string>::iterator found = pairs.find(arg_name);
	if (found != pairs.end()) {
		std::istringstream parser(found->second);
		parser >> val;
	}
}
// char* specialization: returns a malloc'd copy of the stored value
// (the caller owns and must free it), or NULL when the flag is absent.
template <>
void CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
{
using namespace std;
map<string, string>::iterator itr;
if ((itr = pairs.find(arg_name)) != pairs.end()) {
// Found: duplicate the stored value into a heap buffer.
string s = itr->second;
val = (char*) malloc(sizeof(char) * (s.length() + 1));
strcpy(val, s.c_str());
} else {
val = NULL;
}
}
/******************************************************************************
* Main
******************************************************************************/
// Benchmark driver: parses --help/--i/--n/--keys-only/--v, selects the
// OpenCL vendor at compile time, then runs the timed sort on OpenCL and
// (when compiled in) DX11.
int main( int argc, char** argv)
{
//srand(time(NULL));
srand(0); // presently deterministic
// Default problem size; trailing comments are earlier experiment sizes.
unsigned int num_elements = 1024*1024*12;//16*1024;//8*524288;//2048;//512;//524288;
unsigned int iterations = 10;
bool keys_only;
//
// Check command line arguments
//
CommandLineArgs args(argc,argv);
if (args.CheckCmdLineFlag("help"))
{
Usage();
return 0;
}
// Optional overrides from the command line.
args.GetCmdLineArgument("i", iterations);
args.GetCmdLineArgument("n", num_elements);
keys_only = args.CheckCmdLineFlag("keys-only");
g_verbose = args.CheckCmdLineFlag("v");
DeviceUtils::Config cfg;
// Choose AMD or NVidia
#ifdef CL_PLATFORM_AMD
cfg.m_vendor = DeviceUtils::Config::VD_AMD;
#endif
#ifdef CL_PLATFORM_NVIDIA
cfg.m_vendor = DeviceUtils::Config::VD_NV;
#endif
// OpenCL run; keys and values are 32-bit unsigned integers.
TestSort<unsigned int, unsigned int, TYPE_CL>(
iterations,
num_elements,
keys_only, cfg);
#ifdef ADL_ENABLE_DX11
// DirectCompute run, only when DX11 support is compiled in.
TestSort<unsigned int, unsigned int, TYPE_DX11>(
iterations,
num_elements,
keys_only, cfg);
#endif //ADL_ENABLE_DX11
}