Add the GPU rigid body pipeline from https://github.com/erwincoumans/experiments as a Bullet 3.x preview for Bullet 2.80

2012-03-05 00:54:32 +00:00
parent 73c4646b40
commit 571af41cf6
257 changed files with 55106 additions and 0 deletions
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlAabb.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlAabb.cpp
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlAabb.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlAabb.h
@@ -0,0 +1,230 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+#ifndef AABB_H
+#define AABB_H
+
+#include "Stubs/AdlMath.h"
+#include "Stubs/AdlQuaternion.h"
+
+// Shape type tags. Numbering starts at 2 and continues consecutively.
+enum AdlCollisionShapeTypes
+{
+	ADL_SHAPE_SPHERE          = 2,
+	ADL_SHAPE_HEIGHT_FIELD    = 3,
+	SHAPE_CONVEX_HEIGHT_FIELD = 4,
+};
+
+// Axis-aligned bounding box stored as two float4 corners.
+// All member functions are declared inline here and defined below the struct.
+_MEM_CLASSALIGN16
+struct Aabb
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+
+	//	reset / growth
+	__inline void setEmpty();
+	__inline void includeVolume( const Aabb& aabb );
+	__inline void includePoint( const float4& p );
+	__inline void expandBy( const float4& r );
+
+	//	queries
+	__inline bool overlaps( const float4& p ) const;
+	__inline bool overlaps( const Aabb& aabb ) const;
+	__inline static bool overlaps( const Aabb& a, const Aabb& b );
+	__inline float4 center() const;
+	__inline int getMajorAxis() const;
+	__inline float4 getExtent() const;
+	__inline bool intersect(const float4* from, const float4* to, const float4* invRay) const;
+
+	//	in-place transforms (rotation given as quaternion or 3x3 matrix)
+	__inline void transform(const float4& translation, const Quaternion& quat);
+	__inline void transform(const float4& translation, const Matrix3x3& rot);
+
+	//	data; the max corner is stored before the min corner
+	float4 m_max;
+	float4 m_min;
+};
+
+// Puts the box into an inverted state: m_min is built from FLT_MAX and m_max
+// from -FLT_MAX, so any later include*() call replaces both corners.
+// (assumes the one-argument make_float4 fills x/y/z with the value - confirm in Adlfloat4.inl)
+void Aabb::setEmpty()
+{
+	m_min = make_float4( FLT_MAX );
+	m_max = make_float4( -FLT_MAX );
+}
+
+// Grows this box so that it also encloses aabb. Only x, y and z are touched;
+// the w lanes of both corners are left as they are.
+void Aabb::includeVolume(const Aabb& aabb)
+{
+	for(int axis=0; axis<3; axis++)
+	{
+		m_max.s[axis] = max2( m_max.s[axis], aabb.m_max.s[axis] );
+		m_min.s[axis] = min2( m_min.s[axis], aabb.m_min.s[axis] );
+	}
+}
+
+// Grows this box so that it also encloses the point p (x, y, z only).
+void Aabb::includePoint( const float4& p )
+{
+	//	raise the upper corner
+	m_max.x = max2( m_max.x, p.x );
+	m_max.y = max2( m_max.y, p.y );
+	m_max.z = max2( m_max.z, p.z );
+
+	//	lower the lower corner
+	m_min.x = min2( m_min.x, p.x );
+	m_min.y = min2( m_min.y, p.y );
+	m_min.z = min2( m_min.z, p.z );
+}
+
+// Returns true when p lies inside the box or on its boundary (x, y, z only).
+bool Aabb::overlaps( const float4& p ) const
+{
+	const float4 toMax = m_max-p;
+	const float4 fromMin = p-m_min;
+
+	//	Written with >= so that a NaN component still yields false.
+	if( !(toMax.x >= 0 && toMax.y >= 0 && toMax.z >= 0) ) return false;
+
+	return (fromMin.x >= 0 && fromMin.y >= 0 && fromMin.z >= 0);
+}
+
+// Member form of the box/box test; forwards to the static two-argument
+// overload with this box as the first operand.
+bool Aabb::overlaps( const Aabb& in ) const
+{
+	return Aabb::overlaps( *this, in );
+}
+
+// Box/box overlap test on x, y and z. Boxes that merely touch count as overlapping.
+bool Aabb::overlaps( const Aabb& a, const Aabb& b )
+{
+	//	One flag per axis: true when the two intervals are disjoint on that axis.
+	const bool apartX = ( a.m_max.x < b.m_min.x || a.m_min.x > b.m_max.x );
+	const bool apartY = ( a.m_max.y < b.m_min.y || a.m_min.y > b.m_max.y );
+	const bool apartZ = ( a.m_max.z < b.m_min.z || a.m_min.z > b.m_max.z );
+
+	return !( apartX || apartY || apartZ );
+}
+
+// Returns half the sum of the two corners, i.e. the midpoint of the box.
+float4 Aabb::center() const
+{
+	const float4 cornerSum = m_max+m_min;
+	return 0.5f*cornerSum;
+}
+
+// Returns the index (0 = x, 1 = y, 2 = z) of the axis with the largest extent.
+// On ties the lower index wins because both comparisons are strict.
+int Aabb::getMajorAxis() const
+{
+	const float4 size = getExtent();
+
+	const int widerOfXY = ( size.s[1] > size.s[0] ) ? 1 : 0;
+	return ( size.s[2] > size.s[widerOfXY] ) ? 2 : widerOfXY;
+}
+
+// Returns the full edge lengths of the box: max corner minus min corner.
+float4 Aabb::getExtent() const
+{
+	const float4 size = m_max-m_min;
+	return size;
+}
+
+// Pushes both corners outwards by r: r is added to m_max and subtracted from m_min.
+// m_max is updated first; if r refers to one of this box's own corners, the
+// second statement sees the already-modified value.
+void Aabb::expandBy( const float4& r )
+{
+	m_max += r;
+	m_min -= r;
+}
+
+// Segment-versus-box test using per-axis slabs.
+//   from   : start point of the segment
+//   to     : not read by this function
+//   invRay : multiplied component-wise with (corner - from); presumably the
+//            reciprocal of (to - from) - confirm against callers
+// Returns true when the entry/exit parameter range, clamped to [0,1], is non-empty.
+bool Aabb::intersect(const float4* from, const float4* to, const float4* invRay) const
+{
+	//	Parameter values at which the segment reaches the max / min planes.
+	float4 tAtMax = (m_max - *from);
+	tAtMax *= *invRay;
+	float4 tAtMin = (m_min - *from);
+	tAtMin *= *invRay;
+
+	const float4 tExit = max2(tAtMax, tAtMin);
+	const float4 tEnter = min2(tAtMax, tAtMin);
+
+	//	Earliest exit and latest entry over the three axes.
+	float exitT = min2(tExit.x, min2(tExit.y, tExit.z));
+	float enterT = max2(tEnter.x, max2(tEnter.y, tEnter.z));
+
+	//	Restrict to the segment itself.
+	exitT = min2(1.0f, exitT );
+	enterT = max2(0.0f, enterT);
+
+	return (exitT >= enterT);
+}
+
+// Replaces this box with bounds computed from its current corners, the rows
+// of m and translation.
+//   translation : starting value of both new corners (all four lanes are copied)
+//   m           : 3x3 matrix; row j contributes to output component j
+// For each row the products row*m_min and row*m_max are formed (presumably
+// component-wise - confirm operator* in Adlfloat4.inl); the smaller terms are
+// summed into the new min corner and the larger terms into the new max corner.
+void Aabb::transform(const float4& translation, const Matrix3x3& m)
+{
+	//	Snapshot the old corners: the members are overwritten right below.
+	const float4 lo = m_min;
+	const float4 hi = m_max;
+
+	m_max = m_min = translation;
+
+	for(int j=0; j<3; j++)
+	{
+		const float4 a = m.m_row[j]*lo;
+		const float4 b = m.m_row[j]*hi;
+
+		m_min.s[j] += min2( a.x, b.x )+min2( a.y, b.y )+min2( a.z, b.z );
+		m_max.s[j] += max2( a.x, b.x )+max2( a.y, b.y )+max2( a.z, b.z );
+	}
+}
+
+// Quaternion overload: converts quat with qtGetRotationMatrix and forwards to
+// the Matrix3x3 overload.
+void Aabb::transform(const float4& translation, const Quaternion& quat)
+{
+	transform( translation, qtGetRotationMatrix( quat ) );
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlArray.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlArray.h
@@ -0,0 +1,212 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ARRAY_H
+#define ARRAY_H
+
+#include <string.h>
+#include <malloc.h>
+#include <Common/Base/Error.h>
+#include <new.h>
+
+
+// Growable array backed by 16-byte aligned storage from _aligned_malloc.
+// Copy construction is private; elements are moved on growth with memcpy.
+template <class T>
+class Array
+{
+	public:
+		//	lifetime
+		__inline Array();
+		__inline Array(int size);
+		__inline ~Array();
+
+		//	element access
+		__inline T& operator[] (int idx);
+		__inline const T& operator[] (int idx) const;
+		__inline T* begin();
+		__inline const T* begin() const;
+		__inline int indexOf(const T& data) const;
+
+		//	size management
+		__inline void pushBack(const T& elem);
+		__inline void popBack();
+		__inline void clear();
+		__inline void setSize(int size);
+		__inline int getSize() const;
+		__inline void removeAt(int idx);
+		__inline T& expandOne();
+
+	private:
+		//	Private so that arrays cannot be copy-constructed from outside.
+		Array(const Array& a){}
+
+	private:
+		enum
+		{
+			DEFAULT_SIZE = 128,		//	initial capacity of a default-constructed array
+			INCREASE_SIZE = 128,	//	capacity added by pushBack when full
+		};
+
+		T* m_data;			//	aligned element storage
+		int m_size;			//	number of elements in use
+		int m_capacity;		//	number of elements the storage can hold
+};
+
+// Creates an empty array with room for DEFAULT_SIZE elements; every slot is
+// default-constructed in place.
+template<class T>
+Array<T>::Array()
+{
+	m_size = 0;
+	m_capacity = DEFAULT_SIZE;
+	m_data = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
+
+	int slot = 0;
+	while( slot < m_capacity )
+	{
+		new(&m_data[slot])T;
+		slot++;
+	}
+}
+
+// Creates an array holding `size` default-constructed elements; the capacity
+// equals the size.
+template<class T>
+Array<T>::Array(int size)
+{
+	m_capacity = size;
+	m_size = size;
+	m_data = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
+
+	int slot = 0;
+	while( slot < m_capacity )
+	{
+		new(&m_data[slot])T;
+		slot++;
+	}
+}
+
+// Releases the aligned storage. Element destructors are not invoked.
+template<class T>
+Array<T>::~Array()
+{
+	if( !m_data ) return;
+
+	_aligned_free( m_data );
+	m_data = NULL;
+}
+
+// Mutable element access. The index is checked by CLASSERT against both ends
+// of the valid range [0, m_size).
+template<class T>
+T& Array<T>::operator[](int idx)
+{
+	//	idx is signed, so a negative value must be rejected as well.
+	CLASSERT(idx>=0 && idx<m_size);
+	return m_data[idx];
+}
+
+// Read-only element access. The index is checked by CLASSERT against both ends
+// of the valid range [0, m_size).
+template<class T>
+const T& Array<T>::operator[](int idx) const
+{
+	//	idx is signed, so a negative value must be rejected as well.
+	CLASSERT(idx>=0 && idx<m_size);
+	return m_data[idx];
+}
+
+// Appends a copy of elem, growing the storage by INCREASE_SIZE slots when full.
+//   elem : value to append; it may refer to an element of this array.
+template<class T>
+void Array<T>::pushBack(const T& elem)
+{
+	if( m_size != m_capacity )
+	{
+		m_data[ m_size++ ] = elem;
+		return;
+	}
+
+	//	elem may live inside the buffer that is freed below, so take a copy
+	//	before the old storage goes away.
+	const T held = elem;
+
+	const int oldCap = m_capacity;
+	m_capacity += INCREASE_SIZE;
+	T* s = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
+	//	Default-construct every slot first, as setSize() does, so the
+	//	assignment below targets a constructed object.
+	for(int i=0; i<m_capacity; i++) new(&s[i])T;
+	memcpy( s, m_data, sizeof(T)*oldCap );
+	_aligned_free( m_data );
+	m_data = s;
+
+	m_data[ m_size++ ] = held;
+}
+
+// Drops the last element by shrinking the size; the storage is untouched.
+// Asserts that the array is not empty.
+template<class T>
+void Array<T>::popBack()
+{
+	CLASSERT( m_size>0 );
+	m_size -= 1;
+}
+
+// Resets the element count to zero. Capacity and storage are kept.
+template<class T>
+void Array<T>::clear()
+{
+	m_size = 0;
+}
+
+// Sets the element count to `size`. When that exceeds the capacity, new
+// storage of exactly `size` slots is allocated, default-constructed, and the
+// old contents are copied over with memcpy.
+template<class T>
+void Array<T>::setSize(int size)
+{
+	if( !( size > m_capacity ) )
+	{
+		//	Fits into the current storage: only the count changes.
+		m_size = size;
+		return;
+	}
+
+	const int previousCap = m_capacity;
+	m_capacity = size;
+
+	T* grown = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
+	for(int slot=0; slot<m_capacity; slot++) new(&grown[slot])T;
+	memcpy( grown, m_data, sizeof(T)*previousCap );
+
+	_aligned_free( m_data );
+	m_data = grown;
+	m_size = size;
+}
+
+// Returns the number of elements currently in use.
+template<class T>
+int Array<T>::getSize() const
+{
+	return m_size;
+}
+
+// Returns a read-only pointer to the first slot of the storage.
+template<class T>
+const T* Array<T>::begin() const
+{
+	return m_data;
+}
+
+// Returns a pointer to the first slot of the storage.
+template<class T>
+T* Array<T>::begin()
+{
+	return m_data;
+}
+
+// Linear search using operator==. Returns the index of the first element
+// equal to data, or -1 when there is none.
+template<class T>
+int Array<T>::indexOf(const T& data) const
+{
+	int found = -1;
+	for(int i=0; i<m_size && found<0; i++)
+	{
+		if( data == m_data[i] ) found = i;
+	}
+	return found;
+}
+
+// Removes the element at idx by overwriting it with the last element and
+// shrinking the size by one. Element order is not preserved.
+template<class T>
+void Array<T>::removeAt(int idx)
+{
+	//	idx is signed, so a negative value must be rejected as well.
+	CLASSERT(idx>=0 && idx<m_size);
+	m_data[idx] = m_data[--m_size];
+}
+
+// Grows the array by one element through setSize() and returns a reference to
+// the new last slot.
+template<class T>
+T& Array<T>::expandOne()
+{
+	const int newLast = m_size;
+	setSize( newLast+1 );
+	return m_data[ newLast ];
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlCollideUtils.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlCollideUtils.h
@@ -0,0 +1,111 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef COLLIDE_UTILS_H
+#define COLLIDE_UTILS_H
+
+#include "Stubs/AdlMath.h"
+
+
+// Static helpers for point/triangle and ray/triangle queries.
+class CollideUtils
+{
+	public:
+		//	Point p against triangle (a, b, c); see the definition for details.
+		template<bool FLIPSIGN>
+		static bool collide(const float4& a, const float4& b, const float4& c, const float4& p, float4& normalOut, float margin = 0.f);
+
+		//	Segment rayFrom->rayTo against triangle (v0, v1, v2); see the definition for details.
+		__inline
+		static float castRay(const float4& v0, const float4& v1, const float4& v2,
+			 const float4& rayFrom, const float4& rayTo, float margin = 0.0f, float4* bCrdOut = NULL);
+};
+
+
+// Tests point p against the three edges of triangle (a, b, c).
+//   normalOut : written only on success; xyz is the normalized triangle normal
+//               and w is dot3F4(normal, p - a). The whole vector is negated
+//               when FLIPSIGN is true.
+//   margin    : tolerance applied to the three edge terms.
+// Returns true when all three edge terms are below +margin or all are above -margin.
+template<bool FLIPSIGN>
+bool CollideUtils::collide(const float4& a, const float4& b, const float4& c, const float4& p, float4& normalOut, float margin)
+{
+	//	triangle edges
+	const float4 edgeAB = b-a;
+	const float4 edgeBC = c-b;
+	const float4 edgeCA = a-c;
+
+	//	vectors from each vertex to the query point
+	const float4 fromA = p-a;
+	const float4 fromB = p-b;
+	const float4 fromC = p-c;
+
+	float4 n = cross3(edgeAB, -1.f*edgeCA);
+
+	const float4 crossAB = cross3( edgeAB, fromA );
+	const float4 crossBC = cross3( edgeBC, fromB );
+	const float4 crossCA = cross3( edgeCA, fromC );
+
+	const float s0 = dot3F4(n,crossAB);
+	const float s1 = dot3F4(n,crossBC);
+	const float s2 = dot3F4(n,crossCA);
+
+	const bool allBelow = ( s0<margin && s1<margin && s2<margin );
+	const bool allAbove = ( s0>-margin && s1>-margin && s2>-margin );
+	if( !( allBelow || allAbove ) ) return false;
+
+	n = normalize3( n );
+	n.w = dot3F4(n,fromA);
+
+	normalOut = (FLIPSIGN)? -n : n;
+	return true;
+}
+
+// Intersects the segment rayFrom->rayTo with triangle (v0, v1, v2).
+//   margin  : tolerance on the barycentric range test; each coordinate must
+//             lie in [-margin, 1+margin].
+//   bCrdOut : optional; on a hit receives the barycentric coordinates
+//             (x = u, y = v, z = w). w of the float4 is not written.
+// Returns the parameter t of the hit along the segment (t is not range
+// checked), or -1 when there is no hit.
+__inline
+float CollideUtils::castRay(const float4& v0, const float4& v1, const float4& v2,
+			 const float4& rayFrom, const float4& rayTo, float margin, float4* bCrdOut)
+{
+	const float4 ab = v1 - v0;
+	const float4 ac = v2 - v0;
+	const float4 qp = rayFrom - rayTo;
+	const float4 normal = cross3( ab, ac );
+	const float d = dot3F4( qp, normal );
+
+	//	d is zero when the segment is parallel to the triangle plane or the
+	//	triangle is degenerate. Dividing by it would produce inf/NaN, and NaN
+	//	passes every range test below, so report a miss instead.
+	if( d == 0.f ) return -1;
+
+	const float odd = 1.f/d;
+	const float4 ap = rayFrom - v0;
+	const float t = dot3F4( ap, normal )*odd;
+
+	const float4 e = cross3( qp, ap );
+	const float v = dot3F4( ac, e )*odd;
+	if( v < -margin || v > 1.f+margin ) return -1;
+	const float w = -dot3F4( ab, e )*odd;
+	if( w < -margin || w > 1.f+margin ) return -1;
+
+	const float u = 1.f-v-w;
+	if( u < -margin || u > 1.f+margin ) return -1;
+
+	if( bCrdOut )
+	{
+		bCrdOut->x = u;
+		bCrdOut->y = v;
+		bCrdOut->z = w;
+	}
+	return t;
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlCollisionShape.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlCollisionShape.h
@@ -0,0 +1,49 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef COLLISION_SHAPE_H
+#define COLLISION_SHAPE_H
+
+#include "Stubs/AdlMath.h"
+#include "Stubs/AdlAabb.h"
+
+
+// Abstract base class for collision shapes: a type tag, a bounding box, a
+// collision margin and two pure virtual distance queries.
+_MEM_CLASSALIGN16
+class CollisionShape
+{
+	public:
+		_MEM_ALIGNED_ALLOCATOR16;
+
+		enum Type
+		{
+			SHAPE_HEIGHT_FIELD = 0,
+			SHAPE_CONVEX_HEIGHT_FIELD = 1,
+			SHAPE_PLANE = 2,
+			MAX_NUM_SHAPE_TYPES = 3,
+		};
+
+		//	Stores the type tag and margin; m_aabb is not set here.
+		CollisionShape( Type type, float collisionMargin = 0.0025f )
+			: m_type( type ), m_collisionMargin( collisionMargin )
+		{
+		}
+		virtual ~CollisionShape(){}
+
+		//	Distance queries implemented by the concrete shapes.
+		virtual float queryDistance(const float4& p) const = 0;
+		virtual bool queryDistanceWithNormal(const float4& p, float4& normalOut) const = 0;
+
+	public:
+		Type m_type;
+		Aabb m_aabb;
+		float m_collisionMargin;
+};
+
+#endif
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlConstraint4.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlConstraint4.h
@@ -0,0 +1,49 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ADL_CONSTRAINT4_H
+#define ADL_CONSTRAINT4_H
+
+
+
+// Solver constraint holding data for up to four points between two bodies.
+// The field order is kept exactly as declared.
+struct Constraint4
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+
+	float4 m_linear;				//	w holds the friction coefficient (see accessors)
+	float4 m_worldPos[4];
+	float4 m_center;				//	friction
+	float m_jacCoeffInv[4];
+	float m_b[4];
+	float m_appliedRambdaDt[4];
+
+	float m_fJacCoeffInv[2];		//	friction
+	float m_fAppliedRambdaDt[2];	//	friction
+
+	u32 m_bodyA;
+	u32 m_bodyB;
+
+	u32 m_batchIdx;
+	u32 m_paddings[1];
+
+	//	The friction coefficient is stored in the w lane of m_linear.
+	__inline
+	void setFrictionCoeff(float value)
+	{
+		m_linear.w = value;
+	}
+	__inline
+	float getFrictionCoeff() const
+	{
+		return m_linear.w;
+	}
+};
+
+#endif //ADL_CONSTRAINT4_H
+		
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlContact4.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlContact4.h
@@ -0,0 +1,102 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ADL_CONTACT4_H
+#define ADL_CONTACT4_H
+
+// SolverData is a pointer to an adl::Buffer of Constraint4 when
+// CL_PLATFORM_AMD is defined, and an opaque void* otherwise.
+#ifdef CL_PLATFORM_AMD
+#include "AdlConstraint4.h"
+#include "Adl/Adl.h"
+
+typedef adl::Buffer<Constraint4>* SolverData;
+#else
+typedef void* SolverData;
+#endif
+
+// Opaque handle type for shape data.
+typedef void* ShapeDataType;
+
+
+// Contact manifold with up to four points between two bodies.
+// Restitution and friction are stored as 16-bit fractions of 0xffff.
+struct Contact4
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+
+	float4 m_worldPos[4];			//	w of each entry is returned by getPenetration()
+	float4 m_worldNormal;			//	w is returned by getNPoints()
+	u16 m_restituitionCoeffCmp;
+	u16 m_frictionCoeffCmp;
+	int m_batchIdx;
+
+	u32 m_bodyAPtr;
+	u32 m_bodyBPtr;
+
+	//	todo. make it safer
+	int& getBatchIdx()
+	{
+		return m_batchIdx;
+	}
+
+	float getRestituitionCoeff() const
+	{
+		const float stored = (float)m_restituitionCoeffCmp;
+		return (stored/(float)0xffff);
+	}
+	void setRestituitionCoeff( float c )
+	{
+		ADLASSERT( c >= 0.f && c <= 1.f );
+		m_restituitionCoeffCmp = (u16)(c*0xffff);
+	}
+
+	float getFrictionCoeff() const
+	{
+		const float stored = (float)m_frictionCoeffCmp;
+		return (stored/(float)0xffff);
+	}
+	void setFrictionCoeff( float c )
+	{
+		ADLASSERT( c >= 0.f && c <= 1.f );
+		m_frictionCoeffCmp = (u16)(c*0xffff);
+	}
+
+	float& getNPoints()
+	{
+		return m_worldNormal.w;
+	}
+	float getNPoints() const
+	{
+		return m_worldNormal.w;
+	}
+
+	float getPenetration(int idx) const
+	{
+		return m_worldPos[idx].w;
+	}
+
+	//	True when the two body values sum to zero.
+	bool isInvalid() const
+	{
+		const u32 sum = m_bodyAPtr+m_bodyBPtr;
+		return sum == 0;
+	}
+};
+
+// Contact manifold variant that stores float coefficients and raw body pointers.
+struct ContactPoint4
+{
+	float4 m_worldPos[4];			//	w of each entry is returned by getPenetration()
+	union
+	{
+		float4 m_worldNormal;
+
+		//	Overlays m_worldNormal; m_nPoints shares storage with its w lane.
+		struct Data
+		{
+			int m_padding[3];
+			float m_nPoints;	//	for cl
+		}m_data;
+
+	};
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+
+	void* m_bodyAPtr;
+	void* m_bodyBPtr;
+
+	float& getNPoints() { return m_data.m_nPoints; }
+	float getNPoints() const { return m_data.m_nPoints; }
+
+	float getPenetration(int idx) const { return m_worldPos[idx].w; }
+
+	//	A contact is invalid when neither body pointer is set. The pointers are
+	//	compared directly: casting them to u32 truncates 64-bit addresses, and
+	//	the truncated sum can wrap to zero for non-null pointers.
+	bool isInvalid() const { return m_bodyAPtr == 0 && m_bodyBPtr == 0; }
+
+};
+
+
+#endif //ADL_CONTACT4_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlError.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlError.h
@@ -0,0 +1,80 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef CL_ERROR_H
+#define CL_ERROR_H
+
+#ifdef DX11RENDER
+#include <windows.h>
+#endif
+
+// Assertion macros.
+// _DEBUG builds: break into the debugger (__debugbreak) when x is false.
+// Other builds: x is still evaluated inside an empty `if`, nothing else happens.
+// NOTE(review): both forms expand to a bare `if`, so a use placed directly
+// before an `else` would bind to that else - confirm no call site does this.
+#ifdef _DEBUG
+	#include <assert.h>
+	#define CLASSERT(x) if(!(x)){__debugbreak(); }
+	#define ADLASSERT(x) if(!(x)){__debugbreak(); }
+#else
+	#define CLASSERT(x) if(x){}
+	#define ADLASSERT(x) if(x){}
+
+#endif
+
+
+
+
+// Compile-time check, active only in _DEBUG builds: declares a local array
+// whose size is x, so a false condition asks for a zero-length array
+// (presumably rejected by the compiler - confirm for the toolchain in use).
+// The array is referenced once after its declaration. Expands to nothing otherwise.
+#ifdef _DEBUG
+	#define COMPILE_TIME_ASSERT(x) {int compileTimeAssertFailed[x]; compileTimeAssertFailed[0];}
+#else
+	#define COMPILE_TIME_ASSERT(x)
+#endif
+
+// printf-style debug output.
+// _DEBUG builds: with DX11RENDER the message is formatted into a 256-byte
+// buffer and passed to OutputDebugString (converted to wide characters first
+// when UNICODE is defined); without DX11RENDER it goes to stdout via vprintf.
+// Other builds: the function does nothing.
+#ifdef _DEBUG
+	#include <stdarg.h>
+	#include <stdio.h>
+	__inline
+	void debugPrintf(const char *fmt, ...)
+	{
+		va_list args;
+		va_start(args, fmt);
+#ifndef DX11RENDER
+		vprintf(fmt, args);
+#else
+		char text[256];
+		vsprintf_s( text, 256, fmt, args );
+#ifdef UNICODE
+		WCHAR wide[256];
+		//	First call queries the required length, second call converts.
+		const int wideLen = MultiByteToWideChar(0,0,text,-1,wide,0);
+		MultiByteToWideChar(0,0,text,-1,wide,wideLen);
+
+		OutputDebugString( wide );
+#else
+		OutputDebugString( text );
+#endif
+#endif
+		va_end(args);
+	}
+#else
+	__inline
+	void debugPrintf(const char *fmt, ...)
+	{
+	}
+#endif
+
+
+// Prints msg through debugPrintf with a "WARNING: " prefix and a newline.
+// NOTE(review): the expansion already ends in ';', so `WARN(x);` yields an
+// extra empty statement - take care when using it as the body of an if/else.
+#define WARN(msg) debugPrintf("WARNING: %s\n", msg);
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlMath.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlMath.h
@@ -0,0 +1,216 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef CL_MATH_H
+#define CL_MATH_H
+
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <xmmintrin.h>
+
+
+#include "AdlError.h"
+#include <algorithm>
+#define pxSort std::sort
+
+// Single-precision pi.
+#define PI       3.14159265358979323846f
+// Rounds num up to the next multiple of alignment (integer arithmetic).
+#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
+
+
+// Requests 16-byte alignment for the type that follows (MSVC __declspec syntax).
+#define _MEM_CLASSALIGN16 __declspec(align(16))
+// Class-body macro: gives the class operator new/delete (scalar and array)
+// backed by _aligned_malloc/_aligned_free with 16-byte alignment, plus
+// placement new/delete that simply pass the pointer through.
+#define _MEM_ALIGNED_ALLOCATOR16 	void* operator new(size_t size) { return _aligned_malloc( size, 16 ); } \
+	void operator delete(void *p) { _aligned_free( p ); } \
+	void* operator new[](size_t size) { return _aligned_malloc( size, 16 ); } \
+	void operator delete[](void *p) { _aligned_free( p ); } \
+	void* operator new(size_t size, void* p) { return p; } \
+	void operator delete(void *p, void* pp) {} 
+
+
+
+// Returns the smallest power of two that is >= n, for n >= 1.
+// n == 0 yields 0. T is expected to be an integer type.
+template<class T>
+T nextPowerOf2(T n)
+{
+	n -= 1;
+	//	Smear the highest set bit into every lower position. Doubling the shift
+	//	each round needs only log2(bit width) steps.
+	for(unsigned int shift=1; shift<sizeof(T)*8; shift<<=1)
+		n = n | (n>>shift);
+	return n+1;
+}
+
+
+// 16-byte aligned vector of four floats. The union exposes the same storage
+// as named lanes (x, y, z, w), as an array (s[0..3]) and as an SSE register
+// value (m_quad).
+_MEM_CLASSALIGN16
+struct float4
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+	union
+	{
+		struct
+		{
+			float x,y,z,w;
+		};
+		struct
+		{
+			float s[4];
+		};
+		__m128 m_quad;
+	};
+};
+
+// Returns 1 when all four lanes of a compare equal to 0.f, otherwise 0.
+__forceinline
+unsigned int isZero(const float4& a)
+{
+	const bool xyZero = (a.x == 0.f) && (a.y == 0.f);
+	const bool zwZero = (a.z == 0.f) && (a.w == 0.f);
+	return (xyZero && zwZero) ? 1u : 0u;
+}
+
+// 16-byte aligned vector of four ints, accessible as x/y/z/w or as s[0..3].
+_MEM_CLASSALIGN16
+struct int4
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+	union
+	{
+		struct
+		{
+			int x,y,z,w;
+		};
+		struct
+		{
+			int s[4];
+		};
+	};
+};
+
+// Pair of ints, accessible as x/y or as s[0..1].
+struct int2
+{
+	union
+	{
+		struct
+		{
+			int x,y;
+		};
+		struct
+		{
+			int s[2];
+		};
+	};
+};
+
+// Pair of floats, accessible as x/y or as s[0..1].
+struct float2
+{
+	union
+	{
+		struct
+		{
+			float x,y;
+		};
+		struct
+		{
+			float s[2];
+		};
+	};
+};
+
+
+// Short aliases for the unsigned integer types; the widths in the names
+// assume the usual 32/16/8-bit sizes of int/short/char.
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+
+
+#include "Adlfloat4.inl"
+//#include <Common/Math/float4SSE.inl>
+
+
+
+
+// Exchanges the values of a and b through one temporary copy.
+template<typename T>
+void swap2(T& a, T& b)
+{
+	T saved( a );
+	a = b;
+	b = saved;
+}
+
+
+// Seeds the C runtime random generator used by randRange().
+__inline
+void randSeed(int seed)
+{
+	const unsigned int runtimeSeed = seed;
+	srand( runtimeSeed );
+}
+
+// Returns minV plus a random fraction of (maxV - minV). The fraction comes
+// from rand() in steps of 1/10000 and lies in [0, 0.9999]. The result is
+// converted back to T.
+template<typename T>
+__inline
+T randRange(const T& minV, const T& maxV)
+{
+	const float fraction = (rand()%10000)/10000.f;
+	const T span = maxV - minV;
+	return (T)(minV + fraction*span);
+}
+
+// float4 specialization: draws an independent random fraction per lane and
+// returns minV + r*range.
+// NOTE(review): the four rand() calls are function arguments, so the order in
+// which lanes consume random numbers is left to the compiler.
+template<>
+__inline
+float4 randRange(const float4& minV, const float4& maxV)
+{
+	float4 r = make_float4( (rand()%10000)/10000.f, (rand()%10000)/10000.f, (rand()%10000)/10000.f, (rand()%10000)/10000.f );
+	float4 range = maxV - minV;
+	return (minV + r*range);
+}
+
+
+// Key/value pair for sorting. The 32-bit key can also be viewed as two
+// 16-bit halves through m_key16. Ordering compares m_key only.
+struct SortData
+{
+	union
+	{
+		u32 m_key;
+		struct { u16 m_key16[2]; };
+	};
+	u32 m_value;
+
+	friend bool operator <(const SortData& a, const SortData& b)
+	{
+		return a.m_key < b.m_key;
+	}
+};
+
+
+
+// Returns baseAddr advanced by `offset` bytes, reinterpreted as T*.
+template<typename T>
+T* addByteOffset(void* baseAddr, u32 offset)
+{
+	//	Byte-wise pointer arithmetic keeps the full pointer width; going through
+	//	u32 would drop the upper half of a 64-bit address.
+	return (T*)(((char*)baseAddr)+offset);
+}
+
+
+// Pair of 32-bit unsigned values. The default constructor leaves both
+// members uninitialized.
+struct Pair32
+{
+	Pair32(){}
+	Pair32(u32 a, u32 b) : m_a(a), m_b(b){}
+
+	u32 m_a;
+	u32 m_b;
+};
+
+// Pair of untyped pointers. The default constructor leaves both members
+// uninitialized; the templated constructor accepts two pointers of the same
+// type and stores them as void*.
+struct PtrPair
+{
+	PtrPair(){}
+	PtrPair(void* a, void* b) : m_a(a), m_b(b){}
+	template<typename T>
+	PtrPair(T* a, T* b) : m_a((void*)a), m_b((void*)b){}
+
+	void* m_a;
+	void* m_b;
+};
+
+#endif
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlMatrix3x3.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlMatrix3x3.h
@@ -0,0 +1,194 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef MATRIX3X3_H
+#define MATRIX3X3_H
+
+#include "AdlMath.h"
+
+///////////////////////////////////////
+//	Matrix3x3
+///////////////////////////////////////
+
+// 3x3 matrix stored as three float4 rows, 16-byte aligned.
+// Declared as a named struct, like float4 and Aabb: an unnamed struct with
+// member functions (the allocator macro adds operator new/delete) should not
+// be named through a typedef.
+_MEM_CLASSALIGN16
+struct Matrix3x3
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+	float4 m_row[3];
+};
+
+// Prototypes for the Matrix3x3 helpers defined below.
+
+//	all-zero matrix
+__inline
+Matrix3x3 mtZero();
+
+//	identity matrix
+__inline
+Matrix3x3 mtIdentity();
+
+//	diagonal matrix with entries a, b, c
+__inline
+Matrix3x3 mtDiagonal(float a, float b, float c);
+
+//	transpose of m
+__inline
+Matrix3x3 mtTranspose(const Matrix3x3& m);
+
+//	matrix product a*b
+__inline
+Matrix3x3 mtMul(const Matrix3x3& a, const Matrix3x3& b);
+
+//	matrix times column vector
+__inline
+float4 mtMul1(const Matrix3x3& a, const float4& b);
+
+//	scalar times matrix
+__inline
+Matrix3x3 mtMul2(float a, const Matrix3x3& b);
+
+//	row vector times matrix (parameter names match the definition)
+__inline
+float4 mtMul3(const float4& a, const Matrix3x3& b);
+
+//	inverse of m
+__inline
+Matrix3x3 mtInvert(const Matrix3x3& m);
+
+// Returns a matrix whose three rows are make_float4(0.f).
+__inline
+Matrix3x3 mtZero()
+{
+	Matrix3x3 zero;
+	for(int row=0; row<3; row++)
+		zero.m_row[row] = make_float4(0.f);
+	return zero;
+}
+
+// Returns the identity matrix (ones on the diagonal of the 3x3 part).
+__inline
+Matrix3x3 mtIdentity()
+{
+	Matrix3x3 identity;
+	identity.m_row[2] = make_float4(0,0,1);
+	identity.m_row[1] = make_float4(0,1,0);
+	identity.m_row[0] = make_float4(1,0,0);
+	return identity;
+}
+
+// Returns a matrix with a, b, c on the diagonal and zeros elsewhere in the 3x3 part.
+__inline
+Matrix3x3 mtDiagonal(float a, float b, float c)
+{
+	Matrix3x3 diag;
+	diag.m_row[2] = make_float4(0,0,c);
+	diag.m_row[1] = make_float4(0,b,0);
+	diag.m_row[0] = make_float4(a,0,0);
+	return diag;
+}
+
+// Returns the transpose of the 3x3 part of m; the w lane of every row is 0.
+__inline
+Matrix3x3 mtTranspose(const Matrix3x3& m)
+{
+	Matrix3x3 out;
+	for(int col=0; col<3; col++)
+	{
+		//	Row `col` of the result is column `col` of the input.
+		out.m_row[col] = make_float4(m.m_row[0].s[col], m.m_row[1].s[col], m.m_row[2].s[col], 0.f);
+	}
+	return out;
+}
+
+// Returns the matrix product a*b (3x3 part). The w lane of every result row
+// is set to 0, matching mtTranspose and mtInvert.
+__inline
+Matrix3x3 mtMul(const Matrix3x3& a, const Matrix3x3& b)
+{
+	//	Rows of the transpose are the columns of b, so each entry is a row/row dot product.
+	const Matrix3x3 transB = mtTranspose( b );
+	Matrix3x3 ans;
+	for(int i=0; i<3; i++)
+	{
+		ans.m_row[i].s[0] = dot3F4(a.m_row[i],transB.m_row[0]);
+		ans.m_row[i].s[1] = dot3F4(a.m_row[i],transB.m_row[1]);
+		ans.m_row[i].s[2] = dot3F4(a.m_row[i],transB.m_row[2]);
+		//	Do not hand back an uninitialized lane.
+		ans.m_row[i].s[3] = 0.f;
+	}
+	return ans;
+}
+
+// Returns a*b with b treated as a column vector: component i is
+// dot3F4(row i of a, b). The w lane of the result is set to 0.
+__inline
+float4 mtMul1(const Matrix3x3& a, const float4& b)
+{
+	float4 ans;
+	ans.s[0] = dot3F4( a.m_row[0], b );
+	ans.s[1] = dot3F4( a.m_row[1], b );
+	ans.s[2] = dot3F4( a.m_row[2], b );
+	//	Do not hand back an uninitialized lane.
+	ans.s[3] = 0.f;
+	return ans;
+}
+
+// Returns b with every row multiplied by the scalar a.
+__inline
+Matrix3x3 mtMul2(float a, const Matrix3x3& b)
+{
+	Matrix3x3 scaled;
+	for(int row=0; row<3; row++)
+		scaled.m_row[row] = a*b.m_row[row];
+	return scaled;
+}
+
+// Returns a*b with a treated as a row vector: component j is the sum over i
+// of a[i]*b.m_row[i][j]. The w lane of the result is set to 0.
+__inline
+float4 mtMul3(const float4& a, const Matrix3x3& b)
+{
+	float4 ans;
+	ans.x = a.x*b.m_row[0].x + a.y*b.m_row[1].x + a.z*b.m_row[2].x;
+	ans.y = a.x*b.m_row[0].y + a.y*b.m_row[1].y + a.z*b.m_row[2].y;
+	ans.z = a.x*b.m_row[0].z + a.y*b.m_row[1].z + a.z*b.m_row[2].z;
+	//	Do not hand back an uninitialized lane.
+	ans.w = 0.f;
+	return ans;
+}
+
+// Returns the inverse of the 3x3 part of m, built from the adjugate scaled by
+// 1/det. A zero determinant trips CLASSERT. The w lanes are written as 0
+// before the final scaling.
+__inline
+Matrix3x3 mtInvert(const Matrix3x3& m)
+{
+	const float4& r0 = m.m_row[0];
+	const float4& r1 = m.m_row[1];
+	const float4& r2 = m.m_row[2];
+
+	float det = r0.s[0]*r1.s[1]*r2.s[2]+r1.s[0]*r2.s[1]*r0.s[2]+r2.s[0]*r0.s[1]*r1.s[2]
+	-r0.s[0]*r2.s[1]*r1.s[2]-r2.s[0]*r1.s[1]*r0.s[2]-r1.s[0]*r0.s[1]*r2.s[2];
+
+	CLASSERT( det );
+
+	//	Adjugate, one row at a time.
+	Matrix3x3 adj;
+	adj.m_row[0].s[0] = r1.s[1]*r2.s[2] - r1.s[2]*r2.s[1];
+	adj.m_row[0].s[1] = r0.s[2]*r2.s[1] - r0.s[1]*r2.s[2];
+	adj.m_row[0].s[2] = r0.s[1]*r1.s[2] - r0.s[2]*r1.s[1];
+	adj.m_row[0].w = 0.f;
+
+	adj.m_row[1].s[0] = r1.s[2]*r2.s[0] - r1.s[0]*r2.s[2];
+	adj.m_row[1].s[1] = r0.s[0]*r2.s[2] - r0.s[2]*r2.s[0];
+	adj.m_row[1].s[2] = r0.s[2]*r1.s[0] - r0.s[0]*r1.s[2];
+	adj.m_row[1].w = 0.f;
+
+	adj.m_row[2].s[0] = r1.s[0]*r2.s[1] - r1.s[1]*r2.s[0];
+	adj.m_row[2].s[1] = r0.s[1]*r2.s[0] - r0.s[0]*r2.s[1];
+	adj.m_row[2].s[2] = r0.s[0]*r1.s[1] - r0.s[1]*r1.s[0];
+	adj.m_row[2].w = 0.f;
+
+	return mtMul2((1.0f/det), adj);
+}
+
+// Builds a matrix whose rows are a, b and c, in that order.
+__inline
+Matrix3x3 mtSet( const float4& a, const float4& b, const float4& c )
+{
+	Matrix3x3 fromRows;
+	fromRows.m_row[2] = c;
+	fromRows.m_row[1] = b;
+	fromRows.m_row[0] = a;
+	return fromRows;
+}
+
+// Row-wise sum of two matrices.
+__inline
+Matrix3x3 operator+(const Matrix3x3& a, const Matrix3x3& b)
+{
+	Matrix3x3 sum;
+	for(int row=0; row<3; row++)
+		sum.m_row[row] = a.m_row[row] + b.m_row[row];
+	return sum;
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlQuaternion.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlQuaternion.h
@@ -0,0 +1,155 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef QUATERNION_H
+#define QUATERNION_H
+
+#include "AdlMatrix3x3.h"
+
+
+//	A quaternion is stored in a float4: the code below keeps the vector part in
+//	s[0..2] (x,y,z) and the scalar part in s[3] (w).
+typedef float4 Quaternion;
+
+//	Builds a quaternion from an axis (normalized internally) and an angle.
+__inline
+Quaternion qtSet(const float4& axis, float angle);
+
+//	Quaternion product of a and b.
+__inline
+Quaternion qtMul(const Quaternion& a, const Quaternion& b);
+
+//	Rotates vec by q, computed as q * vec * qtInvert(q).
+__inline
+float4 qtRotate(const Quaternion& q, const float4& vec);
+
+//	Rotates vec by qtInvert(q).
+__inline
+float4 qtInvRotate(const Quaternion& q, const float4& vec);
+
+//	Returns the conjugate of q (vector part negated).
+__inline
+Quaternion qtInvert(const Quaternion& q);
+
+//	Converts a quaternion into a 3x3 rotation matrix.
+__inline
+Matrix3x3 qtGetRotationMatrix(const Quaternion& quat);
+
+//	Returns q scaled to unit length (all four components are used).
+__inline
+Quaternion qtNormalize(const Quaternion& q);
+
+__inline
+Quaternion qtGetIdentity()
+{
+	//	Identity rotation: zero vector part, scalar part of one.
+	return make_float4(0.f, 0.f, 0.f, 1.f);
+}
+
+__inline
+Quaternion qtSet(const float4& axis, float angle)
+{
+	//	q = ( n*sin(angle/2), cos(angle/2) ) where n is the normalized axis.
+	//	The axis is normalized with normalize3, so a zero-length axis is not handled here.
+	const float4 unitAxis = normalize3( axis );
+
+	return make_float4(
+		unitAxis.s[0]*sin(angle/2),
+		unitAxis.s[1]*sin(angle/2),
+		unitAxis.s[2]*sin(angle/2),
+		cos(angle/2) );
+}
+
+__inline
+Quaternion qtMul(const Quaternion& a, const Quaternion& b)
+{
+	//	Vector part: a.xyz x b.xyz + a.w*b.xyz + b.w*a.xyz
+	//	Scalar part: a.w*b.w - dot(a.xyz, b.xyz)
+	const float aw = a.s[3];
+	const float bw = b.s[3];
+
+	Quaternion product = cross3( a, b );
+	product += aw*b + bw*a;
+	//	The += above also touched the scalar slot; overwrite it with the real value.
+	product.s[3] = aw*bw - (a.s[0]*b.s[0]+a.s[1]*b.s[1]+a.s[2]*b.s[2]);
+	return product;
+}
+
+__inline
+float4 qtRotate(const Quaternion& q, const float4& vec)
+{
+	//	Treat vec as a quaternion with zero scalar part and evaluate q * v * conj(q).
+	Quaternion pure = vec;
+	pure.s[3] = 0.f;
+	return qtMul( qtMul( q, pure ), qtInvert( q ) );
+}
+
+__inline
+float4 qtInvRotate(const Quaternion& q, const float4& vec)
+{
+	//	Rotating by the conjugate applies the opposite rotation.
+	const Quaternion conj = qtInvert( q );
+	return qtRotate( conj, vec );
+}
+
+__inline
+Quaternion qtInvert(const Quaternion& q)
+{
+	//	Conjugate: negate the vector part, keep the scalar part.
+	//	No renormalization is done, so this is the inverse only for unit quaternions.
+	return make_float4( -q.s[0], -q.s[1], -q.s[2], q.s[3] );
+}
+
+__inline
+Matrix3x3 qtGetRotationMatrix(const Quaternion& quat)
+{
+	//	Expands a quaternion (x,y,z,w) into a 3x3 rotation matrix.
+	//	The w component of every row is set to zero.
+	const float x = quat.s[0];
+	const float y = quat.s[1];
+	const float z = quat.s[2];
+	const float w = quat.s[3];
+
+	//	Squares of the vector components, used on the diagonal.
+	const float xx = x*x;
+	const float yy = y*y;
+	const float zz = z*z;
+
+	Matrix3x3 rot;
+	rot.m_row[0] = make_float4( 1-2*yy-2*zz, 2*x*y-2*w*z, 2*x*z+2*w*y, 0.f );
+	rot.m_row[1] = make_float4( 2*x*y+2*w*z, 1-2*xx-2*zz, 2*y*z-2*w*x, 0.f );
+	rot.m_row[2] = make_float4( 2*x*z-2*w*y, 2*y*z+2*w*x, 1-2*xx-2*yy, 0.f );
+	return rot;
+}
+
+//	Converts the rotation matrix pointed to by m into a quaternion.
+//	This is the inverse of qtGetRotationMatrix (same element/sign convention).
+//
+//	m	: pointer to a single rotation matrix (only m[0] is read).
+//	returns	: quaternion (x,y,z,w) with w >= 0.
+//
+//	The straightforward formula divides by 4*w, which is zero (or NaN, when
+//	rounding pushes trace+1 below zero) for rotations of about 180 degrees.
+//	When the trace is not positive we therefore solve for the largest of
+//	x, y, z first and derive the remaining components from it.
+__inline
+Quaternion qtGetQuaternion(const Matrix3x3* m)
+{
+	const float4& r0 = m[0].m_row[0];
+	const float4& r1 = m[0].m_row[1];
+	const float4& r2 = m[0].m_row[2];
+
+	Quaternion q;
+	const float trace = r0.x + r1.y + r2.z;
+
+	if( trace > 0.f )
+	{
+		//	Well conditioned: w is the largest component.
+		q.w = sqrtf( trace + 1 ) * 0.5f;
+		float inv4w = 1.f/(4.f*q.w);
+		q.x = (r2.y-r1.z)*inv4w;
+		q.y = (r0.z-r2.x)*inv4w;
+		q.z = (r1.x-r0.y)*inv4w;
+		return q;
+	}
+
+	if( r0.x >= r1.y && r0.x >= r2.z )
+	{
+		//	x is the largest component; s == 4*|x|
+		const float s = sqrtf( 1.f + r0.x - r1.y - r2.z ) * 2.f;
+		const float invS = 1.f/s;
+		q.x = 0.25f*s;
+		q.y = (r0.y+r1.x)*invS;
+		q.z = (r0.z+r2.x)*invS;
+		q.w = (r2.y-r1.z)*invS;
+	}
+	else if( r1.y >= r2.z )
+	{
+		//	y is the largest component; s == 4*|y|
+		const float s = sqrtf( 1.f + r1.y - r0.x - r2.z ) * 2.f;
+		const float invS = 1.f/s;
+		q.x = (r0.y+r1.x)*invS;
+		q.y = 0.25f*s;
+		q.z = (r1.z+r2.y)*invS;
+		q.w = (r0.z-r2.x)*invS;
+	}
+	else
+	{
+		//	z is the largest component; s == 4*|z|
+		const float s = sqrtf( 1.f + r2.z - r0.x - r1.y ) * 2.f;
+		const float invS = 1.f/s;
+		q.x = (r0.z+r2.x)*invS;
+		q.y = (r1.z+r2.y)*invS;
+		q.z = 0.25f*s;
+		q.w = (r1.x-r0.y)*invS;
+	}
+
+	//	q and -q describe the same rotation; keep w non-negative so the result
+	//	uses the same sign convention as the trace > 0 branch.
+	if( q.w < 0.f )
+	{
+		q.x = -q.x;
+		q.y = -q.y;
+		q.z = -q.z;
+		q.w = -q.w;
+	}
+
+	return q;
+}
+
+__inline
+Quaternion qtNormalize(const Quaternion& q)
+{
+	//	Unit-length quaternion; delegates to the 4-component normalize.
+	const Quaternion unit = normalize4(q);
+	return unit;
+}
+
+__inline
+float4 transform(const float4& p, const float4& translation, const Quaternion& orientation)
+{
+	//	Rotate p by orientation, then offset by translation.
+	const float4 rotated = qtRotate( orientation, p );
+	return rotated + translation;
+}
+
+__inline
+float4 invTransform(const float4& p, const float4& translation, const Quaternion& orientation)
+{
+	//	Undo transform(): remove the translation first, then apply the inverse rotation.
+	const float4 offset = p-translation;
+	return qtInvRotate( orientation, offset );
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlRigidBody.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlRigidBody.h
@@ -0,0 +1,59 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ADL_RIGID_BODY_H
+#define ADL_RIGID_BODY_H
+
+#include "AdlQuaternion.h"
+
+//	Container for the plain-data records that describe a rigid body.
+//	It declares no members of its own, only the nested Body and Inertia structs.
+class RigidBodyBase
+{
+	public:
+
+		//	Per-body state. Declared with the project's 16-byte class alignment
+		//	and aligned-allocator macros.
+		_MEM_CLASSALIGN16
+		struct Body
+		{
+			_MEM_ALIGNED_ALLOCATOR16;
+
+			float4 m_pos;		//	position
+			Quaternion m_quat;	//	orientation
+			float4 m_linVel;	//	linear velocity
+			float4 m_angVel;	//	angular velocity
+
+			u32 m_shapeIdx;		//	index of the body's collision shape (presumably into a shape buffer -- confirm with callers)
+			u32 m_shapeType;	//	shape type id (presumably an AdlCollisionShapeTypes value -- confirm)
+
+			float m_invMass;	//	inverse mass
+			float m_restituitionCoeff;	//	restitution coefficient (member name spelling kept as is)
+			float m_frictionCoeff;	//	friction coefficient
+			
+		};
+
+		//	Inverse inertia tensors of a body.
+		struct Inertia
+		{
+			//	The commented-out fields below are now members of Body.
+/*			u16 m_shapeType;
+			u16 m_shapeIdx;
+			float m_restituitionCoeff;
+			float m_frictionCoeff;
+			int m_padding;
+*/
+			Matrix3x3 m_invInertia;		//	current inverse inertia tensor
+			Matrix3x3 m_initInvInertia;	//	initial inverse inertia tensor (presumably body-local -- confirm)
+		};
+};
+
+#endif// ADL_RIGID_BODY_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlTransform.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlTransform.h
@@ -0,0 +1,61 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef _ADL_TRANSFORM_H
+#define _ADL_TRANSFORM_H
+
+#include "AdlMath.h"
+#include "AdlQuaternion.h"
+#include "AdlMatrix3x3.h"
+
+//	Rigid transform: a rotation matrix plus a translation.
+//	trMul1 below applies it to a point as  m_rotation * p + m_translation.
+struct Transform
+{
+	float4 m_translation;	//	translation part
+	Matrix3x3 m_rotation;	//	rotation part
+};
+
+//	Builds a Transform from a translation and an orientation quaternion.
+//	The quaternion is converted to a rotation matrix with qtGetRotationMatrix.
+//	Marked __inline because this definition lives in a header: without it every
+//	translation unit including the header emits its own external definition.
+__inline
+Transform trSetTransform(const float4& translation, const Quaternion& quat)
+{
+	Transform tr;
+	tr.m_translation = translation;
+	tr.m_rotation = qtGetRotationMatrix( quat );
+	return tr;
+}
+
+//	Returns the inverse of a rigid transform.
+//	The rotation is inverted by transposing it (valid for pure rotations), and
+//	the translation becomes  R^T * (-t).
+//	Marked __inline because this definition lives in a header: without it every
+//	translation unit including the header emits its own external definition.
+__inline
+Transform trInvert( const Transform& tr )
+{
+	Transform ans;
+	ans.m_rotation = mtTranspose( tr.m_rotation );
+	ans.m_translation = mtMul1( ans.m_rotation, -tr.m_translation );
+	return ans;
+}
+
+//	Composes two transforms: the result applies trB first, then trA.
+//	rotation = RA * RB,  translation = RA * tB + tA.
+//	Marked __inline because this definition lives in a header: without it every
+//	translation unit including the header emits its own external definition.
+__inline
+Transform trMul(const Transform& trA, const Transform& trB)
+{
+	Transform ans; 
+	ans.m_rotation = mtMul( trA.m_rotation, trB.m_rotation );
+	ans.m_translation = mtMul1( trA.m_rotation, trB.m_translation ) + trA.m_translation;
+	return ans;
+}
+
+//	Applies a transform to a point: rotate p, then add the translation.
+//	Marked __inline because this definition lives in a header: without it every
+//	translation unit including the header emits its own external definition.
+__inline
+float4 trMul1(const Transform& tr, const float4& p)
+{
+	return mtMul1( tr.m_rotation, p ) + tr.m_translation;
+}
+
+
+#endif //_ADL_TRANSFORM_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Adlfloat4.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Adlfloat4.inl
@@ -0,0 +1,373 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+//	Alignment check used at the top of the float4 operators below.
+//	The asserting variant (16-byte alignment of the argument's address) is disabled;
+//	the active definition only evaluates its argument and has no effect.
+//#define CHECK_ALIGNMENT(a) CLASSERT((u32(&(a)) & 0xf) == 0);
+#define CHECK_ALIGNMENT(a) a;
+
+
+//	Builds a float4 from four scalars; w defaults to zero.
+__inline
+float4 make_float4(float x, float y, float z, float w = 0.f)
+{
+	float4 result;
+	result.x = x;
+	result.y = y;
+	result.z = z;
+	result.w = w;
+	return result;
+}
+
+//	Builds a float4 with all four components set to x.
+__inline
+float4 make_float4(float x)
+{
+	const float4 splat = make_float4(x,x,x,x);
+	return splat;
+}
+
+//	Converts each component of an int4 to float.
+__inline
+float4 make_float4(const int4& x)
+{
+	const float fx = (float)x.s[0];
+	const float fy = (float)x.s[1];
+	const float fz = (float)x.s[2];
+	const float fw = (float)x.s[3];
+	return make_float4(fx, fy, fz, fw);
+}
+
+//	Builds a float2 from two scalars.
+__inline
+float2 make_float2(float x, float y)
+{
+	float2 result;
+	result.s[0] = x;
+	result.s[1] = y;
+	return result;
+}
+
+//	Builds a float2 with both components set to x.
+__inline
+float2 make_float2(float x)
+{
+	const float2 splat = make_float2(x,x);
+	return splat;
+}
+
+//	Converts each component of an int2 to float.
+__inline
+float2 make_float2(const int2& x)
+{
+	const float fx = (float)x.s[0];
+	const float fy = (float)x.s[1];
+	return make_float2(fx, fy);
+}
+
+//	Builds an int4 from four integers; w defaults to zero.
+__inline
+int4 make_int4(int x, int y, int z, int w = 0)
+{
+	int4 result;
+	result.s[0] = x;
+	result.s[1] = y;
+	result.s[2] = z;
+	result.s[3] = w;
+	return result;
+}
+
+//	Builds an int4 with all four components set to x.
+__inline
+int4 make_int4(int x)
+{
+	const int4 splat = make_int4(x,x,x,x);
+	return splat;
+}
+
+//	Converts each component of a float4 to int (truncating cast).
+__inline
+int4 make_int4(const float4& x)
+{
+	const int ix = (int)x.x;
+	const int iy = (int)x.y;
+	const int iz = (int)x.z;
+	const int iw = (int)x.w;
+	return make_int4(ix, iy, iz, iw);
+}
+
+//	Builds an int2 from two integers.
+__inline
+int2 make_int2(int a, int b)
+{
+	int2 result;
+	result.x = a;
+	result.y = b;
+	return result;
+}
+
+//	Unary minus: negates all four components.
+__inline
+float4 operator-(const float4& a)
+{
+	const float nx = -a.x;
+	const float ny = -a.y;
+	const float nz = -a.z;
+	const float nw = -a.w;
+	return make_float4(nx, ny, nz, nw);
+}
+
+//	Component-wise product of two float4 values.
+//	Uses CHECK_ALIGNMENT like the other operators in this file; the former
+//	hard-coded assert cast the address to u32, which truncates pointers on
+//	64-bit targets, and this scalar path does not need 16-byte alignment.
+__inline
+float4 operator*(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 out;
+	out.s[0] = a.s[0]*b.s[0];
+	out.s[1] = a.s[1]*b.s[1];
+	out.s[2] = a.s[2]*b.s[2];
+	out.s[3] = a.s[3]*b.s[3];
+	return out;
+}
+
+//	Scales all four components of b by the scalar a.
+__inline
+float4 operator*(float a, const float4& b)
+{
+	return make_float4(a*b.s[0], a*b.s[1], a*b.s[2], a*b.s[3]);
+}
+
+//	Scales all four components of b by the scalar a.
+__inline
+float4 operator*(const float4& b, float a)
+{
+	CHECK_ALIGNMENT(b);
+
+	return make_float4(a*b.s[0], a*b.s[1], a*b.s[2], a*b.s[3]);
+}
+
+//	In-place component-wise multiply by another float4.
+__inline
+void operator*=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int i=0; i<4; i++)
+	{
+		a.s[i] *= b.s[i];
+	}
+}
+
+//	In-place multiply of all four components by a scalar.
+__inline
+void operator*=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int i=0; i<4; i++)
+	{
+		a.s[i] *= b;
+	}
+}
+
+//
+//	Component-wise quotient a/b (no check for zero components in b).
+__inline
+float4 operator/(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 quotient;
+	for(int i=0; i<4; i++)
+	{
+		quotient.s[i] = a.s[i]/b.s[i];
+	}
+	return quotient;
+}
+
+//	Divides all four components of b by the scalar a.
+__inline
+float4 operator/(const float4& b, float a)
+{
+	CHECK_ALIGNMENT(b);
+
+	const float qx = b.s[0]/a;
+	const float qy = b.s[1]/a;
+	const float qz = b.s[2]/a;
+	const float qw = b.s[3]/a;
+	return make_float4(qx, qy, qz, qw);
+}
+
+//	In-place component-wise divide by another float4.
+__inline
+void operator/=(float4& a, const float4& b)
+{
+	a.s[0]/=b.s[0];
+	a.s[1]/=b.s[1];
+	a.s[2]/=b.s[2];
+	a.s[3]/=b.s[3];
+}
+
+//	In-place divide of all four components by a scalar.
+//	Uses CHECK_ALIGNMENT like the other operators in this file; the former
+//	hard-coded assert cast the address to u32, which truncates pointers on
+//	64-bit targets, and this scalar path does not need 16-byte alignment.
+__inline
+void operator/=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	a.s[0]/=b;
+	a.s[1]/=b;
+	a.s[2]/=b;
+	a.s[3]/=b;
+}
+//
+
+//	Component-wise sum of two float4 values.
+__inline
+float4 operator+(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 sum;
+	for(int i=0; i<4; i++)
+	{
+		sum.s[i] = a.s[i]+b.s[i];
+	}
+	return sum;
+}
+
+//	Adds the scalar b to every component of a.
+__inline
+float4 operator+(const float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 sum;
+	for(int i=0; i<4; i++)
+	{
+		sum.s[i] = a.s[i]+b;
+	}
+	return sum;
+}
+
+//	Component-wise difference a-b.
+__inline
+float4 operator-(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 diff;
+	for(int i=0; i<4; i++)
+	{
+		diff.s[i] = a.s[i]-b.s[i];
+	}
+	return diff;
+}
+
+//	Subtracts the scalar b from every component of a.
+__inline
+float4 operator-(const float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 diff;
+	for(int i=0; i<4; i++)
+	{
+		diff.s[i] = a.s[i]-b;
+	}
+	return diff;
+}
+
+//	In-place component-wise add of another float4.
+__inline
+void operator+=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int i=0; i<4; i++)
+	{
+		a.s[i] += b.s[i];
+	}
+}
+
+//	In-place add of a scalar to every component.
+__inline
+void operator+=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int i=0; i<4; i++)
+	{
+		a.s[i] += b;
+	}
+}
+
+//	In-place component-wise subtract of another float4.
+__inline
+void operator-=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int i=0; i<4; i++)
+	{
+		a.s[i] -= b.s[i];
+	}
+}
+
+//	In-place subtract of a scalar from every component.
+__inline
+void operator-=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int i=0; i<4; i++)
+	{
+		a.s[i] -= b;
+	}
+}
+
+
+
+
+
+//	3D cross product of the xyz parts of a and b; the result's w is zero.
+__inline
+float4 cross3(const float4& a, const float4& b)
+{
+	const float cx = a.s[1]*b.s[2]-a.s[2]*b.s[1];
+	const float cy = a.s[2]*b.s[0]-a.s[0]*b.s[2];
+	const float cz = a.s[0]*b.s[1]-a.s[1]*b.s[0];
+	return make_float4(cx, cy, cz, 0.f);
+}
+
+//	Dot product of the xyz parts of a and b (w is ignored).
+__inline
+float dot3F4(const float4& a, const float4& b)
+{
+	const float dot = a.x*b.x+a.y*b.y+a.z*b.z;
+	return dot;
+}
+
+//	Euclidean length of the xyz part of a.
+__inline
+float length3(const float4& a)
+{
+	const float lengthSq = dot3F4(a,a);
+	return sqrtf(lengthSq);
+}
+
+//	Dot product over all four components.
+__inline
+float dot4(const float4& a, const float4& b)
+{
+	const float dot = a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
+	return dot;
+}
+
+//	for height
+//	Evaluates the plane equation eqn at point, treating point.w as 1:
+//	dot(point.xyz, eqn.xyz) + eqn.w
+__inline
+float dot3w1(const float4& point, const float4& eqn)
+{
+	const float value = point.x*eqn.x+point.y*eqn.y+point.z*eqn.z+eqn.w;
+	return value;
+}
+
+//	Scales a by the reciprocal of its xyz length. All four components are
+//	scaled, and a zero-length input is not guarded against.
+__inline
+float4 normalize3(const float4& a)
+{
+	const float len = sqrtf(dot3F4(a, a));
+	const float invLen = 1.f/len;
+	return invLen * a;
+}
+
+//	Scales a by the reciprocal of its 4-component length.
+//	A zero-length input is not guarded against.
+__inline
+float4 normalize4(const float4& a)
+{
+	const float len = sqrtf(dot4(a, a));
+	const float invLen = 1.f/len;
+	return invLen * a;
+}
+
+//	Plane through the points a, b, c: xyz holds the unit normal
+//	(direction of (b-a) x (c-a)) and w holds -dot(normal, a).
+__inline
+float4 createEquation(const float4& a, const float4& b, const float4& c)
+{
+	const float4 edgeAB = b-a;
+	const float4 edgeAC = c-a;
+
+	float4 plane = normalize3( cross3(edgeAB, edgeAC) );
+	plane.w = -dot3F4(plane,a);
+	return plane;
+}
+
+
+//	Returns a when a>b, otherwise b.
+template<typename T>
+__inline
+T max2(const T& a, const T& b)
+{
+	if( a>b )
+	{
+		return a;
+	}
+	return b;
+}
+
+//	Returns a when a<b, otherwise b.
+template<typename T>
+__inline
+T min2(const T& a, const T& b)
+{
+	if( a<b )
+	{
+		return a;
+	}
+	return b;
+}
+
+//	float4 specialization: component-wise maximum.
+template<>
+__inline
+float4 max2(const float4& a, const float4& b)
+{
+	const float mx = max2(a.x,b.x);
+	const float my = max2(a.y,b.y);
+	const float mz = max2(a.z,b.z);
+	const float mw = max2(a.w,b.w);
+	return make_float4( mx, my, mz, mw );
+}
+
+//	float4 specialization: component-wise minimum.
+template<>
+__inline
+float4 min2(const float4& a, const float4& b)
+{
+	const float mx = min2(a.x,b.x);
+	const float my = min2(a.y,b.y);
+	const float mz = min2(a.z,b.z);
+	const float mw = min2(a.w,b.w);
+	return make_float4( mx, my, mz, mw );
+}
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Adlfloat4SSE.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Adlfloat4SSE.inl
@@ -0,0 +1,381 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+//	Alignment check used at the top of the float4 operators below.
+//	The asserting variant (16-byte alignment of the argument's address) is disabled;
+//	the active definition only evaluates its argument and has no effect.
+//#define CHECK_ALIGNMENT(a) CLASSERT((u32(&(a)) & 0xf) == 0);
+#define CHECK_ALIGNMENT(a) a;
+
+
+//	Builds a float4 from four scalars; w defaults to zero.
+__inline
+float4 make_float4(float x, float y, float z, float w = 0.f)
+{
+	float4 result;
+	//	_mm_setr_ps takes its arguments in memory order (lowest lane first).
+	result.m_quad = _mm_setr_ps(x,y,z,w);
+	return result;
+}
+
+//	Builds a float4 with all four components set to x.
+__inline
+float4 make_float4(float x)
+{
+	const float4 splat = make_float4(x,x,x,x);
+	return splat;
+}
+
+//	Converts each component of an int4 to float.
+__inline
+float4 make_float4(const int4& x)
+{
+	const float fx = (float)x.s[0];
+	const float fy = (float)x.s[1];
+	const float fz = (float)x.s[2];
+	const float fw = (float)x.s[3];
+	return make_float4(fx, fy, fz, fw);
+}
+
+//	Builds a float2 from two scalars.
+__inline
+float2 make_float2(float x, float y)
+{
+	float2 result;
+	result.s[0] = x;
+	result.s[1] = y;
+	return result;
+}
+
+//	Builds a float2 with both components set to x.
+__inline
+float2 make_float2(float x)
+{
+	const float2 splat = make_float2(x,x);
+	return splat;
+}
+
+//	Converts each component of an int2 to float.
+__inline
+float2 make_float2(const int2& x)
+{
+	const float fx = (float)x.s[0];
+	const float fy = (float)x.s[1];
+	return make_float2(fx, fy);
+}
+
+//	Builds an int4 from four integers; w defaults to zero.
+__inline
+int4 make_int4(int x, int y, int z, int w = 0)
+{
+	int4 result;
+	result.s[0] = x;
+	result.s[1] = y;
+	result.s[2] = z;
+	result.s[3] = w;
+	return result;
+}
+
+//	Builds an int4 with all four components set to x.
+__inline
+int4 make_int4(int x)
+{
+	const int4 splat = make_int4(x,x,x,x);
+	return splat;
+}
+
+//	Converts each component of a float4 to int (truncating cast).
+__inline
+int4 make_int4(const float4& x)
+{
+	const int ix = (int)x.x;
+	const int iy = (int)x.y;
+	const int iz = (int)x.z;
+	const int iw = (int)x.w;
+	return make_int4(ix, iy, iz, iw);
+}
+
+//	Builds an int2 from two integers.
+__inline
+int2 make_int2(int a, int b)
+{
+	int2 result;
+	result.x = a;
+	result.y = b;
+	return result;
+}
+
+//	Unary minus, computed as (0 - a) on all four lanes.
+__inline
+float4 operator-(const float4& a)
+{
+	float4 negated;
+	negated.m_quad = _mm_sub_ps( _mm_setzero_ps(), a.m_quad );
+	return negated;
+}
+
+//	Component-wise product of two float4 values.
+__inline
+float4 operator*(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 product;
+	product.m_quad = _mm_mul_ps( a.m_quad, b.m_quad );
+	return product;
+}
+
+//	Scales all four components of b by the scalar a.
+__inline
+float4 operator*(float a, const float4& b)
+{
+	//	Broadcast the scalar to all lanes, then reuse the vector product.
+	float4 scale;
+	scale.m_quad = _mm_set1_ps( a );
+	return scale*b;
+}
+
+//	Scales all four components of b by the scalar a.
+__inline
+float4 operator*(const float4& b, float a)
+{
+	CHECK_ALIGNMENT(b);
+
+	//	Broadcast the scalar to all lanes, then reuse the vector product.
+	float4 scale;
+	scale.m_quad = _mm_set1_ps( a );
+	return scale*b;
+}
+
+//	In-place component-wise multiply by another float4.
+__inline
+void operator*=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	const float4 product = a*b;
+	a = product;
+}
+
+//	In-place multiply of all four components by a scalar.
+__inline
+void operator*=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	//	Broadcast the scalar to all lanes, then multiply lane by lane.
+	float4 scale;
+	scale.m_quad = _mm_set1_ps( b );
+	const float4 product = a*scale;
+	a = product;
+}
+
+//
+//	Component-wise quotient a/b.
+__inline
+float4 operator/(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 quotient;
+	quotient.m_quad = _mm_div_ps( a.m_quad, b.m_quad );
+	return quotient;
+}
+
+//	Divides all four components of b by the scalar a.
+__inline
+float4 operator/(const float4& b, float a)
+{
+	CHECK_ALIGNMENT(b);
+
+	//	Broadcast the divisor to all lanes, then reuse the vector quotient.
+	float4 divisor;
+	divisor.m_quad = _mm_set1_ps( a );
+	return b/divisor;
+}
+
+//	In-place component-wise divide by another float4.
+__inline
+void operator/=(float4& a, const float4& b)
+{
+	a = a/b;
+}
+
+//	In-place divide of all four components by a scalar.
+//	Uses CHECK_ALIGNMENT like the other operators in this file; the former
+//	hard-coded assert cast the address to u32, which truncates pointers on
+//	64-bit targets.
+__inline
+void operator/=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	//	Broadcast the divisor to all lanes, then divide lane by lane.
+	float4 bv; bv.m_quad = _mm_set1_ps( b );
+	a = a/bv;
+}
+//
+
+//	Component-wise sum of two float4 values.
+__inline
+float4 operator+(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 sum;
+	sum.m_quad = _mm_add_ps( a.m_quad, b.m_quad );
+	return sum;
+}
+
+//	Adds the scalar b to every component of a.
+__inline
+float4 operator+(const float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	//	Broadcast the scalar to all lanes, then reuse the vector sum.
+	float4 addend;
+	addend.m_quad = _mm_set1_ps( b );
+	return a+addend;
+}
+
+//	Component-wise difference a-b.
+__inline
+float4 operator-(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 diff;
+	diff.m_quad = _mm_sub_ps( a.m_quad, b.m_quad );
+	return diff;
+}
+
+//	Subtracts the scalar b from every component of a.
+__inline
+float4 operator-(const float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	//	Broadcast the scalar to all lanes, then reuse the vector difference.
+	float4 subtrahend;
+	subtrahend.m_quad = _mm_set1_ps( b );
+	return a-subtrahend;
+}
+
+//	In-place component-wise add of another float4.
+__inline
+void operator+=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	const float4 sum = a + b;
+	a = sum;
+}
+
+//	In-place add of a scalar to every component.
+__inline
+void operator+=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	//	Broadcast the scalar to all lanes, then add lane by lane.
+	float4 addend;
+	addend.m_quad = _mm_set1_ps( b );
+	const float4 sum = a + addend;
+	a = sum;
+}
+
+//	In-place component-wise subtract of another float4.
+__inline
+void operator-=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	const float4 diff = a - b;
+	a = diff;
+}
+
+//	In-place subtract of a scalar from every component.
+__inline
+void operator-=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	//	Broadcast the scalar to all lanes, then subtract lane by lane.
+	float4 subtrahend;
+	subtrahend.m_quad = _mm_set1_ps( b );
+	const float4 diff = a - subtrahend;
+	a = diff;
+}
+
+
+
+
+
+//	3D cross product of the xyz parts of a and b using SSE shuffles.
+//	The w lane of the result is masked to zero.
+__inline
+float4 cross3(const float4& a, const float4& b)
+{	//	xnamathvector.inl
+	//	Lets a bit mask be built from integers and read back as an __m128.
+	union IntVec
+	{
+		unsigned int m_i[4];
+		__m128 m_v;
+	};
+
+	//	All bits set in x,y,z and clear in w.
+	IntVec mask3 = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000};
+	__m128 V1 = a.m_quad;
+	__m128 V2 = b.m_quad;
+
+    // y1,z1,x1,w1
+    __m128 vTemp1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(3,0,2,1));
+    // z2,x2,y2,w2
+    __m128 vTemp2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(3,1,0,2));
+    // Perform the left operation
+    __m128 vResult = _mm_mul_ps(vTemp1,vTemp2);
+    // z1,x1,y1,w1
+    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp1,_MM_SHUFFLE(3,0,2,1));
+    // y2,z2,x2,w2
+    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(3,1,0,2));
+    // Perform the right operation
+    vTemp1 = _mm_mul_ps(vTemp1,vTemp2);
+    // Subtract the right from left, and return answer
+    vResult = _mm_sub_ps(vResult,vTemp1);
+    // Set w to zero
+	float4 ans; ans.m_quad = _mm_and_ps(vResult,mask3.m_v);
+	return ans;
+}
+
+//	Dot product of the xyz parts of a and b (w is ignored), computed with SSE.
+//	The sum is accumulated in lane 0 as (x + y) + z.
+__inline
+float dot3F4(const float4& a, const float4& b)
+{
+//	return a.x*b.x+a.y*b.y+a.z*b.z;
+    // Perform the dot product
+	__m128 V1 = a.m_quad;
+	__m128 V2 = b.m_quad;
+
+	//	Lane-wise products: (x1*x2, y1*y2, z1*z2, w1*w2)
+	__m128 vDot = _mm_mul_ps(V1,V2);
+    // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2]
+    __m128 vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1));
+    // Result.vector4_f32[0] = x+y
+    vDot = _mm_add_ss(vDot,vTemp);
+    // x=Dot.vector4_f32[2]
+    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
+    // Result.vector4_f32[0] = (x+y)+z
+    vDot = _mm_add_ss(vDot,vTemp);
+    // Splat x
+	float4 ans; ans.m_quad = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0));
+	//	Every lane now holds the dot product; return lane 0.
+	return ans.x;
+}
+
+//	Euclidean length of the xyz part of a.
+__inline
+float length3(const float4& a)
+{
+	const float lengthSq = dot3F4(a,a);
+	return sqrtf(lengthSq);
+}
+
+//	Dot product over all four components (scalar code, no intrinsics).
+__inline
+float dot4(const float4& a, const float4& b)
+{
+	const float dot = a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
+	return dot;
+}
+
+//	for height
+//	Evaluates the plane equation eqn at point, treating point.w as 1:
+//	dot(point.xyz, eqn.xyz) + eqn.w
+__inline
+float dot3w1(const float4& point, const float4& eqn)
+{
+	const float value = point.x*eqn.x+point.y*eqn.y+point.z*eqn.z+eqn.w;
+	return value;
+}
+
+//	Scales a by the reciprocal of its xyz length. All four components are
+//	scaled, and a zero-length input is not guarded against.
+__inline
+float4 normalize3(const float4& a)
+{
+	const float len = sqrtf(dot3F4(a, a));
+	const float invLen = 1.f/len;
+	return invLen * a;
+}
+
+//	Scales a by the reciprocal of its 4-component length.
+//	A zero-length input is not guarded against.
+__inline
+float4 normalize4(const float4& a)
+{
+	const float len = sqrtf(dot4(a, a));
+	const float invLen = 1.f/len;
+	return invLen * a;
+}
+
+//	Plane through the points a, b, c: xyz holds the unit normal
+//	(direction of (b-a) x (c-a)) and w holds -dot(normal, a).
+__inline
+float4 createEquation(const float4& a, const float4& b, const float4& c)
+{
+	const float4 edgeAB = b-a;
+	const float4 edgeAC = c-a;
+
+	float4 plane = normalize3( cross3(edgeAB, edgeAC) );
+	plane.w = -dot3F4(plane,a);
+	return plane;
+}
+
+
+//	Returns a when a>b, otherwise b.
+template<typename T>
+__inline
+T max2(const T& a, const T& b)
+{
+	if( a>b )
+	{
+		return a;
+	}
+	return b;
+}
+
+//	Returns a when a<b, otherwise b.
+template<typename T>
+__inline
+T min2(const T& a, const T& b)
+{
+	if( a<b )
+	{
+		return a;
+	}
+	return b;
+}
+
+//	float4 specialization: component-wise maximum.
+template<>
+__inline
+float4 max2(const float4& a, const float4& b)
+{
+	const float mx = max2(a.x,b.x);
+	const float my = max2(a.y,b.y);
+	const float mz = max2(a.z,b.z);
+	const float mw = max2(a.w,b.w);
+	return make_float4( mx, my, mz, mw );
+}
+
+//	float4 specialization: component-wise minimum.
+template<>
+__inline
+float4 min2(const float4& a, const float4& b)
+{
+	const float mx = min2(a.x,b.x);
+	const float my = min2(a.y,b.y);
+	const float mz = min2(a.z,b.z);
+	const float mw = min2(a.w,b.w);
+	return make_float4( mx, my, mz, mw );
+}
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowPhase.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowPhase.h
@@ -0,0 +1,154 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma once
+
+#include <Adl/Adl.h>
+//#include <Common/Base/SyncObjects.h>
+
+#include "AdlMath.h"
+#include "AdlContact4.h"
+#include "AdlRigidBody.h"
+
+#include "../ConvexHeightFieldShape.h"
+
+//#include "TypeDefinition.h"
+//#include "RigidBody.h"
+//#include "ConvexHeightFieldShape.h"
+
+namespace adl
+{
+class ShapeBase;
+
+//	Device-independent part of the narrowphase: holds only the Config type
+//	shared by every ChNarrowphase<TYPE> instantiation.
+class ChNarrowphaseBase
+{
+	public:
+		//	Settings passed to the execute/culling entry points.
+		struct Config
+		{
+			float m_collisionMargin;	//	collision margin (units not visible here)
+		};
+		//	Older ShapeData layout, kept for reference only; the live definition
+		//	is ChNarrowphase<TYPE>::ShapeData.
+/*
+		typedef struct
+		{
+			//	m_normal.w == height in u8
+			float4 m_normal[HEIGHT_RES*HEIGHT_RES*6];
+			u32 m_height4[HEIGHT_RES*HEIGHT_RES*6];
+
+			float m_scale;
+			float m_padding0;
+			float m_padding1;
+			float m_padding2;
+		} ShapeData;
+*/
+};
+
+//	Narrowphase collision stage, parameterized on the device type.
+//	All entry points are static and operate on a Data object obtained from
+//	allocate(); the definitions live in ChNarrowphase.inl.
+template<DeviceType TYPE>
+class ChNarrowphase : public ChNarrowphaseBase
+{
+	public:
+		typedef Launcher::BufferInfo BufferInfo;
+
+		//	Per-instance state: the device, the three kernels and a counter buffer.
+		struct Data
+		{
+			const Device* m_device;
+			Kernel* m_supportCullingKernel;
+			Kernel* m_narrowphaseKernel;
+			Kernel* m_narrowphaseWithPlaneKernel;
+
+			Buffer<u32>* m_counterBuffer;
+		};
+
+		enum
+		{
+			N_TASKS = 4,
+			//	Height-field resolution, taken from ConvexHeightField.
+			HEIGHT_RES = ConvexHeightField::HEIGHT_RES,
+		};
+
+		//	Per-shape data: HEIGHT_RES*HEIGHT_RES entries for each of 6 faces
+		//	(presumably the faces of a cube map -- confirm against ConvexHeightField).
+		struct ShapeData
+		{
+			float4 m_normal[HEIGHT_RES*HEIGHT_RES*6];
+			u32 m_height4[HEIGHT_RES*HEIGHT_RES*6];
+			u32 m_supportHeight4[HEIGHT_RES*HEIGHT_RES*6];
+
+			float m_scale;
+			float m_padding0;
+			float m_padding1;
+			float m_padding2;
+		};
+
+		//	Constant parameters for a kernel launch (presumably uploaded as a
+		//	constant buffer -- confirm in the .inl).
+		struct ConstData
+		{
+			int m_nPairs;
+			float m_collisionMargin;
+			int m_capacity;
+			int m_paddings[1];
+		};
+		
+		//	Creates the Data for the given device (allocated with new in the .inl).
+		static
+		Data* allocate( const Device* device );
+
+		//	Releases a Data previously returned by allocate().
+		static
+		void deallocate( Data* data );
+		//	Previous Buffer<ShapeData>-based shape API, superseded by the
+		//	ShapeDataType overloads below.
+/*
+		static
+		Buffer<ShapeData>* allocateShapeBuffer( const Device* device, int capacity );
+
+		static
+		void deallocateShapeBuffer( Buffer<ShapeData>* shapeBuf );
+
+		static
+		void setShape( Buffer<ShapeData>* shapeBuf, ShapeBase* shape, int idx, float collisionMargin );
+*/
+		//	Allocates a shape buffer with room for capacity shapes.
+		//	ShapeDataType is declared outside this header.
+		static
+		ShapeDataType allocateShapeBuffer( const Device* device, int capacity );
+
+		//	Releases a shape buffer returned by allocateShapeBuffer().
+		static
+		void deallocateShapeBuffer( ShapeDataType shapeBuf );
+
+		//	Stores shape at slot idx of shapeBuf.
+		static
+		void setShape( ShapeDataType shapeBuf, ShapeBase* shape, int idx, float collisionMargin = 0.f );
+		
+		//	Same as above for a ConvexHeightField shape.
+		static
+		void setShape( ShapeDataType shapeBuf, ConvexHeightField* cvxShape, int idx, float collisionMargin = 0.f );
+
+		// Run NarrowphaseKernel
+		//	Processes nPairs body pairs; contacts go to contactOut and their
+		//	count is reported through nContacts.
+		//template<bool USE_OMP>
+		static
+		void execute( Data* data, const Buffer<int2>* pairs, int nPairs, 
+			const Buffer<RigidBodyBase::Body>* bodyBuf, const ShapeDataType shapeBuf,
+			Buffer<Contact4>* contactOut, int& nContacts, const Config& cfg );
+
+		// Run NarrowphaseWithPlaneKernel
+		//	Variant that additionally takes vertex and index buffers.
+		//template<bool USE_OMP>
+		static
+		void execute( Data* data, const Buffer<int2>* pairs, int nPairs, 
+			const Buffer<RigidBodyBase::Body>* bodyBuf, const ShapeDataType shapeBuf,
+			const Buffer<float4>* vtxBuf, const Buffer<int4>* idxBuf,
+			Buffer<Contact4>* contactOut, int& nContacts, const Config& cfg );
+
+		// Run SupportCullingKernel
+		//	Returns an int (presumably the number of pairs written to pairsOut -- confirm in the .inl).
+		//template<bool USE_OMP>
+		static
+		int culling( Data* data, const Buffer<int2>* pairs, int nPairs, const Buffer<RigidBodyBase::Body>* bodyBuf,
+			const ShapeDataType shapeBuf, const Buffer<int2>* pairsOut, const Config& cfg );
+};
+
+//#include <AdlPhysics/Narrowphase/ChNarrowphase.inl>
+//#include <AdlPhysics/Narrowphase/ChNarrowphaseHost.inl>
+
+#include "ChNarrowphase.inl"
+
+};
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphase.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphase.inl
@@ -0,0 +1,303 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+//#define PATH "..\\..\\dynamics\\basic_demo\\Stubs\\ChNarrowphaseKernels"
+#define PATH "..\\..\\dynamics\\basic_demo\\Stubs\\ChNarrowphaseKernels"
+#define KERNEL0 "SupportCullingKernel"
+#define KERNEL1 "NarrowphaseKernel"
+
+#include "ChNarrowphaseKernels.h"
+
+// Stateless helpers used by the ChNarrowphase implementation below.
+class ChNarrowphaseImp
+{
+public:
+	// Packs four bytes into one 32-bit word: x occupies bits 0-7, y bits 8-15,
+	// z bits 16-23 and w bits 24-31.
+	// Each byte is widened to u32 before it is shifted. Without the cast the
+	// operands are promoted to (signed) int, and w >= 128 would be shifted into
+	// the sign bit, which is not portable across language revisions.
+	static
+	__inline
+	u32 u32Pack(u8 x, u8 y, u8 z, u8 w)
+	{
+		return ((u32)x) | ((u32)y<<8) | ((u32)z<<16) | ((u32)w<<24);
+	}
+
+};
+
+// Creates the per-device narrowphase state: fetches the three kernels and
+// allocates the one-element counter buffer. Release it with deallocate().
+template<DeviceType TYPE>
+typename ChNarrowphase<TYPE>::Data* ChNarrowphase<TYPE>::allocate( const Device* device )
+{
+	// Kernel text compiled into the binary is only available when
+	// ADL_LOAD_KERNEL_FROM_STRING is defined; otherwise getKernel is handed a
+	// null source pointer together with PATH.
+#if defined(ADL_LOAD_KERNEL_FROM_STRING)
+	const char* kernelText[] = {narrowphaseKernelsCL, 0};
+#else
+	const char* kernelText[] = {0,0};
+#endif
+
+	// Build options passed to every getKernel call below.
+	char buildOptions[100];
+	sprintf(buildOptions, "-I .\\NarrowPhaseCL\\");
+
+	Data* state = new Data;
+	state->m_device = device;
+
+	state->m_supportCullingKernel = device->getKernel( PATH, KERNEL0, buildOptions, kernelText[TYPE] );
+	state->m_narrowphaseKernel = device->getKernel( PATH, KERNEL1, buildOptions, kernelText[TYPE] );
+	state->m_narrowphaseWithPlaneKernel = device->getKernel( PATH, "NarrowphaseWithPlaneKernel", buildOptions, kernelText[TYPE] );
+
+	// Single u32 that the kernels use as their output counter.
+	state->m_counterBuffer = new Buffer<u32>( device, 1 );
+
+	return state;
+}
+
+/* Releases what allocate() created with new: the counter buffer first, then the Data record.
+   The kernels fetched in allocate() are not released here. */
+template<DeviceType TYPE>
+void ChNarrowphase<TYPE>::deallocate( Data* data )
+{
+	delete data->m_counterBuffer;
+
+	delete data;
+}
+
+// Allocates a device buffer of `capacity` ShapeData entries and hands it back
+// as a ShapeDataType handle. The device must be of this template's TYPE.
+template<DeviceType TYPE>
+ShapeDataType ChNarrowphase<TYPE>::allocateShapeBuffer( const Device* device, int capacity )
+{
+	ADLASSERT( device->m_type == TYPE );
+
+	Buffer<ShapeData>* shapes = new Buffer<ShapeData>( device, capacity );
+	return shapes;
+}
+
+// Destroys a shape buffer obtained from allocateShapeBuffer().
+template<DeviceType TYPE>
+void ChNarrowphase<TYPE>::deallocateShapeBuffer( ShapeDataType shapeBuf )
+{
+	// The handle is a Buffer<ShapeData>* underneath; delete it as that type.
+	delete (Buffer<ShapeData>*)shapeBuf;
+}
+
+// Converts `shape` into a temporary ConvexHeightField and stores its packed
+// representation in entry `idx` of the shape buffer.
+//   shapeBuf        - handle returned by allocateShapeBuffer()
+//   shape           - source shape; only read through the ConvexHeightField constructor
+//   idx             - destination entry in the shape buffer
+//   collisionMargin - amount by which the temporary height field's AABB is expanded
+// The packing itself lives in the ConvexHeightField overload; this overload
+// previously carried a line-for-line copy of it and now delegates instead.
+template<DeviceType TYPE>
+void ChNarrowphase<TYPE>::setShape( ShapeDataType shapeBuf, ShapeBase* shape, int idx, float collisionMargin )
+{
+	ConvexHeightField* cvxShape = new ConvexHeightField( shape );
+
+	// Expands the AABB, packs the data, writes entry idx and waits for the device.
+	setShape( shapeBuf, cvxShape, idx, collisionMargin );
+
+	delete cvxShape;
+}
+
+// Packs an existing ConvexHeightField into a ShapeData record and writes it
+// to entry `idx` of the shape buffer, waiting until the write has completed.
+// Side effect: cvxShape->m_aabb is expanded by collisionMargin.
+template<DeviceType TYPE>
+void ChNarrowphase<TYPE>::setShape( ShapeDataType shapeBuf, ConvexHeightField* cvxShape, int idx, float collisionMargin )
+{
+	Buffer<ShapeData>* target = (Buffer<ShapeData>*)shapeBuf;
+	cvxShape->m_aabb.expandBy( make_float4( collisionMargin ) );
+
+	const int nSamples = HEIGHT_RES*HEIGHT_RES*6;
+
+	ShapeData packed;
+
+	// Normals are copied one to one.
+	for(int i=0; i<nSamples; i++)
+		packed.m_normal[i] = cvxShape->m_normal[i];
+
+	// Heights and support heights are stored four bytes per 32-bit word.
+	for(int word=0; word<nSamples/4; word++)
+	{
+		const int first = 4*word;
+		packed.m_height4[word] = ChNarrowphaseImp::u32Pack(
+			cvxShape->m_data[first], cvxShape->m_data[first+1],
+			cvxShape->m_data[first+2], cvxShape->m_data[first+3] );
+		packed.m_supportHeight4[word] = ChNarrowphaseImp::u32Pack(
+			cvxShape->m_supportHeight[first], cvxShape->m_supportHeight[first+1],
+			cvxShape->m_supportHeight[first+2], cvxShape->m_supportHeight[first+3] );
+	}
+
+	packed.m_scale = cvxShape->m_scale;
+
+	target->write( &packed, 1, idx );
+	DeviceUtils::waitForCompletion( target->m_device );
+}
+
+/* Run NarrowphaseKernel
+   Launches data->m_narrowphaseKernel over nPairs pairs and stores the resulting count in nContacts.
+   nContacts is in/out: it seeds the device counter on entry and receives the counter
+   read back afterwards, clamped to contactOut->getSize().
+   pairs, bodyBuf and contactOut are mapped to TYPE-native buffers for the launch;
+   only contactOut is unmapped with the <true> flag (presumably copy-back - confirm in BufferUtils). */
+template<DeviceType TYPE>
+//template<bool USE_OMP>
+void ChNarrowphase<TYPE>::execute( Data* data, const Buffer<int2>* pairs, int nPairs, const Buffer<RigidBodyBase::Body>* bodyBuf,
+			const ShapeDataType shapeBuf,
+			Buffer<Contact4>* contactOut, int& nContacts, const Config& cfg )
+{
+	if( nPairs == 0 ) return;	// nothing to launch; nContacts is left untouched
+
+	Buffer<ShapeData>* shapeBuffer = (Buffer<ShapeData>*)shapeBuf;
+	ADLASSERT( shapeBuffer->getType() == TYPE );	// the shape buffer is used as is, it is not mapped
+
+	const Device* device = data->m_device;
+
+	Buffer<int2>* gPairsInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, pairs );
+	Buffer<RigidBodyBase::Body>* gBodyInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );
+	Buffer<Contact4>* gContactOutNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, contactOut );	//	this might not be empty
+
+	Buffer<ConstData> constBuffer( device, 1, BufferBase::BUFFER_CONST );
+
+	ConstData cdata;
+	cdata.m_nPairs = nPairs;
+	cdata.m_collisionMargin = cfg.m_collisionMargin;
+	cdata.m_capacity = contactOut->getSize() - nContacts;	// room left after the contacts already present
+
+	u32 n = nContacts;	// the device counter starts at the current contact count
+	data->m_counterBuffer->write( &n, 1 );
+//	DeviceUtils::waitForCompletion( device );
+
+	{
+		BufferInfo bInfo[] = { BufferInfo( gPairsInNative, true ), BufferInfo( shapeBuffer ), BufferInfo( gBodyInNative ), 
+			BufferInfo( gContactOutNative ),
+			BufferInfo( data->m_counterBuffer ) };
+		Launcher launcher( data->m_device, data->m_narrowphaseKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		launcher.launch1D( nPairs*64, 64 );	// 64 work items per pair; second argument is presumably the group size
+	}
+
+	data->m_counterBuffer->read( &n, 1 );
+	DeviceUtils::waitForCompletion( device );	// n is only valid once the read has completed
+
+	BufferUtils::unmap<false>( gPairsInNative, pairs );
+	BufferUtils::unmap<false>( gBodyInNative, bodyBuf );
+	BufferUtils::unmap<true>( gContactOutNative, contactOut );
+
+	nContacts = min2((int)n, contactOut->getSize() );	// the counter is clamped to the buffer size
+}
+
+/* Run NarrowphaseWithPlaneKernel
+   Same sequence as the NarrowphaseKernel overload, except that it launches
+   data->m_narrowphaseWithPlaneKernel. nContacts is in/out and is clamped to contactOut->getSize().
+   NOTE(review): vtxBuf and idxBuf are accepted but never referenced in this body, and the
+   kernel is given the same five buffers as the plain narrowphase kernel - confirm this is intended. */
+template<DeviceType TYPE>
+//template<bool USE_OMP>
+void ChNarrowphase<TYPE>::execute( Data* data, const Buffer<int2>* pairs, int nPairs, 
+			const Buffer<RigidBodyBase::Body>* bodyBuf, const ShapeDataType shapeBuf,
+			const Buffer<float4>* vtxBuf, const Buffer<int4>* idxBuf,
+			Buffer<Contact4>* contactOut, int& nContacts, const Config& cfg )
+{
+	if( nPairs == 0 ) return;	// nothing to launch; nContacts is left untouched
+
+	Buffer<ShapeData>* shapeBuffer = (Buffer<ShapeData>*)shapeBuf;
+	ADLASSERT( shapeBuffer->getType() == TYPE );
+
+	const Device* device = data->m_device;
+
+	Buffer<int2>* gPairsInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, pairs );
+	Buffer<RigidBodyBase::Body>* gBodyInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );	
+	Buffer<Contact4>* gContactOutNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, contactOut );	//	this might not be empty
+
+	Buffer<ConstData> constBuffer( device, 1, BufferBase::BUFFER_CONST );
+
+	ConstData cdata;
+	cdata.m_nPairs = nPairs;
+	cdata.m_collisionMargin = cfg.m_collisionMargin;
+	cdata.m_capacity = contactOut->getSize() - nContacts;	// room left after the contacts already present
+
+	u32 n = nContacts;	// the device counter starts at the current contact count
+	data->m_counterBuffer->write( &n, 1 );
+//	DeviceUtils::waitForCompletion( device );
+
+	{
+		BufferInfo bInfo[] = { BufferInfo( gPairsInNative, true ), BufferInfo( shapeBuffer ), BufferInfo( gBodyInNative ), 
+			BufferInfo( gContactOutNative ),
+			BufferInfo( data->m_counterBuffer ) };
+		Launcher launcher( data->m_device, data->m_narrowphaseWithPlaneKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		launcher.launch1D( nPairs*64, 64 );	// 64 work items per pair
+	}
+
+	data->m_counterBuffer->read( &n, 1 );
+	DeviceUtils::waitForCompletion( device );	// n is only valid once the read has completed
+
+	BufferUtils::unmap<false>( gPairsInNative, pairs );
+	BufferUtils::unmap<false>( gBodyInNative, bodyBuf );
+	BufferUtils::unmap<true>( gContactOutNative, contactOut );
+
+	nContacts = min2((int)n, contactOut->getSize() );	// the counter is clamped to the buffer size
+}
+
+/* Run SupportCullingKernel
+   Launches data->m_supportCullingKernel with pairs as input and pairsOut as output.
+   Returns the counter value read back from the device, clamped to pairsOut->getSize();
+   returns 0 without launching when nPairs is 0.
+   NOTE(review): pairsOut is declared const, yet it is mapped with <TYPE, false> and
+   unmapped with <true>, which looks like a write into it - confirm against BufferUtils. */
+template<DeviceType TYPE>
+//template<bool USE_OMP>
+int ChNarrowphase<TYPE>::culling( Data* data, const Buffer<int2>* pairs, int nPairs, const Buffer<RigidBodyBase::Body>* bodyBuf,
+			const ShapeDataType shapeBuf, const Buffer<int2>* pairsOut, const Config& cfg )
+{
+	if( nPairs == 0 ) return 0;
+
+	Buffer<ShapeData>* shapeBuffer = (Buffer<ShapeData>*)shapeBuf;
+	ADLASSERT( shapeBuffer->getType() == TYPE );
+
+	const Device* device = data->m_device;
+
+	Buffer<int2>* gPairsInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, pairs );
+	Buffer<RigidBodyBase::Body>* gBodyInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );	
+	Buffer<int2>* gPairsOutNative 
+		= BufferUtils::map<TYPE, false>( data->m_device, pairsOut );
+
+	//
+	Buffer<ConstData> constBuffer( device, 1, BufferBase::BUFFER_CONST );
+
+	ConstData cdata;
+	cdata.m_nPairs = nPairs;
+	cdata.m_collisionMargin = cfg.m_collisionMargin;
+	cdata.m_capacity = pairsOut->getSize();
+
+	u32 n = 0;	// unlike execute(), the counter always starts from zero here
+	data->m_counterBuffer->write( &n, 1 );
+//	DeviceUtils::waitForCompletion( device );
+	{
+		BufferInfo bInfo[] = { BufferInfo( gPairsInNative, true ), BufferInfo( shapeBuffer ), BufferInfo( gBodyInNative ), 
+			BufferInfo( gPairsOutNative ), BufferInfo( data->m_counterBuffer ) };
+		Launcher launcher( data->m_device, data->m_supportCullingKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		launcher.launch1D( nPairs, 64 );	// one work item per pair (execute() uses 64 per pair)
+	}
+	data->m_counterBuffer->read( &n, 1 );
+	DeviceUtils::waitForCompletion( device );	// n is only valid once the read has completed
+/*
+	if( gPairsInNative != pairs ) delete gPairsInNative;
+	if( gBodyInNative != bodyBuf ) delete gBodyInNative;
+	if( gPairsOutNative != pairsOut ) 
+	{
+		gPairsOutNative->read( pairsOut->m_ptr, n );
+		DeviceUtils::waitForCompletion( device );
+		delete gPairsOutNative;
+	}
+*/
+	BufferUtils::unmap<false>( gPairsInNative, pairs );
+	BufferUtils::unmap<false>( gBodyInNative, bodyBuf );
+	BufferUtils::unmap<true>( gPairsOutNative, pairsOut );
+
+	return min2((int)n, pairsOut->getSize() );
+}
+
+#undef PATH
+#undef KERNEL0
+#undef KERNEL1
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphaseKernels.cl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphaseKernels.cl
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphaseKernels.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphaseKernels.h
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.h
@@ -0,0 +1,203 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma once
+#ifndef __ADL_SOLVER_H
+#define __ADL_SOLVER_H
+
+
+#include <Adl/Adl.h>
+#include <AdlPrimitives/Math/Math.h>
+#include <AdlPrimitives/Search/BoundSearch.h>
+#include <AdlPrimitives/Sort/RadixSort.h>
+#include <AdlPrimitives/Scan/PrefixScan.h>
+#include <AdlPrimitives/Sort/RadixSort32.h>
+
+//#include <AdlPhysics/TypeDefinition.h>
+#include "AdlRigidBody.h"
+#include "AdlContact4.h"
+
+//#include "AdlPhysics/Batching/Batching.h>
+
+
+#define MYF4 float4
+#define MAKE_MYF4 make_float4
+
+//#define MYF4 float4sse
+//#define MAKE_MYF4 make_float4sse
+
+#include "AdlConstraint4.h"
+
+namespace adl
+{
+// Device-independent part of the solver: the constraint and configuration
+// records, allocation helpers for contact/constraint buffers, and the grid
+// constants shared by every Solver<TYPE>.
+class SolverBase
+{
+	public:
+		
+
+		// One constraint row between two bodies.
+		struct ConstraintData
+		{
+			// All scalar members start at zero, so a default-constructed row has both
+			// body references at 0 and therefore reports isInvalid(). Previously
+			// only m_b and m_appliedRambdaDt were initialised and isInvalid() read
+			// indeterminate values. The float4 members are left for the caller to fill.
+			ConstraintData(): m_jacCoeffInv(0.f), m_b(0.f), m_appliedRambdaDt(0.f), m_bodyAPtr(0), m_bodyBPtr(0) {}
+
+			float4 m_linear; // have to be normalized
+			float4 m_angular0;
+			float4 m_angular1;
+			float m_jacCoeffInv;
+			float m_b;
+			float m_appliedRambdaDt;
+
+			u32 m_bodyAPtr;
+			u32 m_bodyBPtr;
+
+			// A row counts as invalid when both body references are zero.
+			bool isInvalid() const { return ((u32)m_bodyAPtr+(u32)m_bodyBPtr) == 0; }
+			// The friction coefficient is stored in the w component of m_linear.
+			float getFrictionCoeff() const { return m_linear.w; }
+			void setFrictionCoeff(float coeff) { m_linear.w = coeff; }
+		};
+
+		// Parameters for turning contacts into constraints.
+		struct ConstraintCfg
+		{
+			// Every member now has a defined initial value. m_enableParallelSolve and
+			// m_averageExtent used to be left indeterminate although the solver
+			// reads both (branching on the flag, dividing by the extent).
+			// The parallel path is off unless the caller enables it, and the
+			// extent defaults to 1 so that enabling the path without setting an
+			// extent does not divide by zero. Callers that already assign these
+			// fields are unaffected.
+			ConstraintCfg( float dt = 0.f ): m_positionDrift( 0.005f ), m_positionConstraintCoeff( 0.2f ), m_dt(dt),
+				m_enableParallelSolve( false ), m_averageExtent( 1.f ), m_staticIdx(-1) {}
+
+			float m_positionDrift;
+			float m_positionConstraintCoeff;
+			float m_dt;
+			bool m_enableParallelSolve;	// when set, contacts are sorted and batched before conversion
+			float m_averageExtent;		// divisor of the cell scale used while sorting contacts
+			int m_staticIdx;
+		};
+
+		// Allocates a contact buffer with room for `capacity` entries on `device`.
+		static
+		__inline
+		Buffer<Contact4>* allocateContact4( const Device* device, int capacity )
+		{
+			return new Buffer<Contact4>( device, capacity );	
+		}
+
+		// Deletes a buffer returned by allocateContact4().
+		static
+		__inline
+		void deallocateContact4( Buffer<Contact4>* data ) { delete data; }
+
+		// Allocates a Buffer<Constraint4> and returns it as a SolverData handle.
+		static
+		__inline
+		SolverData allocateConstraint4( const Device* device, int capacity )
+		{
+			return new Buffer<Constraint4>( device, capacity );
+		}
+
+		// Deletes a handle returned by allocateConstraint4().
+		static
+		__inline
+		void deallocateConstraint4( SolverData data ) { delete (Buffer<Constraint4>*)data; }
+
+		// Placeholder: allocates nothing and always returns a null pointer.
+		static
+		__inline
+		void* allocateFrictionConstraint( const Device* device, int capacity, u32 type = 0 )
+		{
+			return 0;
+		}
+
+		// Placeholder counterpart of allocateFrictionConstraint(); does nothing.
+		static
+		__inline
+		void deallocateFrictionConstraint( void* data ) 
+		{
+		}
+
+		// Grid and batch constants shared by the solver implementations.
+		enum
+		{
+			N_SPLIT = 16,
+			N_BATCHES = 4,
+			N_OBJ_PER_SPLIT = 10,
+			N_TASKS_PER_BATCH = N_SPLIT*N_SPLIT,
+		};
+};
+/* Contact solver for one device type. All entry points are static and operate on a
+   Data record created by allocate() and destroyed by deallocate().
+   The definitions come from Solver.inl and SolverHost.inl, included after this class. */
+template<DeviceType TYPE>
+class Solver : public SolverBase
+{
+	public:
+		typedef Launcher::BufferInfo BufferInfo;
+		// Per-solver state; apart from m_nIterations every member is filled in by allocate().
+		struct Data
+		{
+			Data() : m_nIterations(4){}
+
+			const Device* m_device;
+			void* m_parallelSolveData;	// Solver.inl stores a SolverDeviceInl::ParallelSolveData* here
+			int m_nIterations;	// iteration count of each of the two solve loops in solveContactConstraint()
+			Kernel* m_batchingKernel;
+			Kernel* m_batchSolveKernel;
+			Kernel* m_contactToConstraintKernel;
+			Kernel* m_setSortDataKernel;
+			Kernel* m_reorderContactKernel;
+			Kernel* m_copyConstraintKernel;
+			//typename RadixSort<TYPE>::Data* m_sort;
+			typename RadixSort32<TYPE>::Data* m_sort32;
+			typename BoundSearch<TYPE>::Data* m_search;
+			typename PrefixScan<TYPE>::Data* m_scan;
+			Buffer<SortData>* m_sortDataBuffer;
+			Buffer<Contact4>* m_contactBuffer;	// may be null; reorderConvertToConstraints() (re)allocates it on demand
+		};
+		// allocate() in Solver.inl preallocates m_contactBuffer only when pairCapacity is below this value.
+		enum
+		{
+			DYNAMIC_CONTACT_ALLOCATION_THRESHOLD = 2000000,
+		};
+		// Creates the Data record: kernels, sort/search/scan helpers and working buffers sized from pairCapacity.
+		static
+		Data* allocate( const Device* device, int pairCapacity );
+		// Releases everything owned by a Data record, then the record itself.
+		static
+		void deallocate( Data* data );
+		// Optionally sorts and batches the contacts (cfg.m_enableParallelSolve), then converts them to constraints in contactCOut.
+		static
+		void reorderConvertToConstraints( Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+		const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+			int nContacts, const ConstraintCfg& cfg );
+		// Runs the batch solve kernel over the constraints for m_nIterations iterations, in two passes.
+		static
+		void solveContactConstraint( Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* inertiaBuf, 
+			SolverData constraint, void* additionalData, int n );
+
+//		static
+//		int createSolveTasks( int batchIdx, Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+//			SolverData constraint, int n, ThreadPool::Task* tasksOut[], int taskCapacity );
+
+
+		//private: (the steps below are called by reorderConvertToConstraints but are left public)
+		static
+		void convertToConstraints( Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+			const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+			int nContacts, const ConstraintCfg& cfg );
+
+		static
+		void sortContacts( Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+			Buffer<Contact4>* contactsIn, void* additionalData, 
+			int nContacts, const ConstraintCfg& cfg );
+
+		static
+		void batchContacts( Data* data, Buffer<Contact4>* contacts, int nContacts, Buffer<u32>* n, Buffer<u32>* offsets, int staticIdx );
+
+};
+
+#include "Solver.inl"
+#include "SolverHost.inl"
+};
+
+#undef MYF4
+#undef MAKE_MYF4
+
+#endif //__ADL_SOLVER_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.inl
@@ -0,0 +1,762 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#define PATH "..\\..\\dynamics\\basic_demo\\Stubs\\SolverKernels"
+#define BATCHING_PATH "..\\..\\dynamics\\basic_demo\\Stubs\\batchingKernels"
+
+#define KERNEL1 "SingleBatchSolveKernel"
+#define KERNEL2 "BatchSolveKernel"
+
+#define KERNEL3 "ContactToConstraintKernel"
+#define KERNEL4 "SetSortDataKernel"
+#define KERNEL5 "ReorderContactKernel"
+#include "SolverKernels.h"
+
+#include "batchingKernels.h"
+
+/* Debug record with sixteen ints and four floats. In this file it is only used when
+   DEBUG_ME is defined: solveContactConstraint() uploads a zeroed array of these, binds
+   it as an extra kernel buffer and reads it back after each launch.
+   NOTE(review): presumably the layout must match a struct on the kernel side - confirm
+   before reordering or resizing members. */
+struct SolverDebugInfo
+{
+	int m_valInt0;
+	int m_valInt1;
+	int m_valInt2;	// printed by the DEBUG_ME code when greater than zero
+	int m_valInt3;	// printed by the DEBUG_ME code when greater than zero
+	
+	int m_valInt4;
+	int m_valInt5;
+	int m_valInt6;
+	int m_valInt7;
+
+	int m_valInt8;
+	int m_valInt9;
+	int m_valInt10;
+	int m_valInt11;
+
+	int	m_valInt12;
+	int	m_valInt13;
+	int	m_valInt14;
+	int	m_valInt15;
+
+
+	float m_val0;
+	float m_val1;
+	float m_val2;
+	float m_val3;
+};
+
+
+
+/* Holder for types private to the device-side solver implementation in this file. */
+class SolverDeviceInl
+{
+public:
+	struct ParallelSolveData	// stored behind Solver<TYPE>::Data::m_parallelSolveData
+	{
+		Buffer<u32>* m_numConstraints;	// allocated with N_SPLIT*N_SPLIT entries in Solver::allocate()
+		Buffer<u32>* m_offsets;	// allocated with N_SPLIT*N_SPLIT entries in Solver::allocate()
+	};
+};
+
+// Creates the solver state for `device`: fetches the batching and solver
+// kernels, allocates the per-cell batch buffers, the sort/search/scan helpers
+// and the working buffers sized from pairCapacity. Release with deallocate().
+template<DeviceType TYPE>
+typename Solver<TYPE>::Data* Solver<TYPE>::allocate( const Device* device, int pairCapacity )
+{
+	// Kernel text compiled into the binary is only available when
+	// ADL_LOAD_KERNEL_FROM_STRING is defined; otherwise getKernel is handed a
+	// null source pointer together with the kernel file path.
+#if defined(ADL_LOAD_KERNEL_FROM_STRING)
+	const char* solverText[] = {solverKernelsCL, 0};
+	const char* batchingText[] = {batchingKernelsCL, 0};
+#else
+	const char* solverText[] = {0,0};
+	const char* batchingText[] = {0,0};
+#endif
+
+	const int nCells = N_SPLIT*N_SPLIT;
+
+	Data* solver = new Data;
+	solver->m_device = device;
+
+	// The batching and batch-solve kernels pass the cache flag explicitly;
+	// the remaining lookups rely on the default of getKernel.
+	bool cacheKernel = true;
+	solver->m_batchingKernel = device->getKernel( BATCHING_PATH, "CreateBatches",  "-I ..\\..\\ ", batchingText[TYPE], cacheKernel );
+	solver->m_batchSolveKernel = device->getKernel( PATH, KERNEL2, "-I ..\\..\\ ", solverText[TYPE], cacheKernel );
+
+	solver->m_contactToConstraintKernel = device->getKernel( PATH, KERNEL3, "-I ..\\..\\ ", solverText[TYPE] );
+	solver->m_setSortDataKernel = device->getKernel( PATH, KERNEL4, "-I ..\\..\\ ", solverText[TYPE] );
+	solver->m_reorderContactKernel = device->getKernel( PATH, KERNEL5, "-I ..\\..\\ ", solverText[TYPE] );
+	solver->m_copyConstraintKernel = device->getKernel( PATH, "CopyConstraintKernel", "-I ..\\..\\ ", solverText[TYPE] );
+
+	// Per-cell constraint counts and offsets used by the batched solve.
+	SolverDeviceInl::ParallelSolveData* batchData = new SolverDeviceInl::ParallelSolveData;
+	batchData->m_numConstraints = new Buffer<u32>( device, nCells );
+	batchData->m_offsets = new Buffer<u32>( device, nCells );
+	solver->m_parallelSolveData = batchData;
+
+	// The sorter is sized to the next multiple of 512.
+	// todo. remove hardcode this
+	const int sortSize = NEXTMULTIPLEOF( pairCapacity, 512 );
+	solver->m_sort32 = RadixSort32<TYPE>::allocate( solver->m_device, sortSize );
+
+	solver->m_search = BoundSearch<TYPE>::allocate( solver->m_device, nCells );
+	solver->m_scan = PrefixScan<TYPE>::allocate( solver->m_device, nCells );
+
+	solver->m_sortDataBuffer = new Buffer<SortData>( solver->m_device, sortSize );
+
+	// Large capacities skip the up-front contact buffer; it is then created
+	// on demand by reorderConvertToConstraints().
+	solver->m_contactBuffer = ( pairCapacity < DYNAMIC_CONTACT_ALLOCATION_THRESHOLD )
+		? new Buffer<Contact4>( solver->m_device, pairCapacity )
+		: 0;
+
+	return solver;
+}
+
+// Releases everything allocate() created, then the Data record itself.
+template<DeviceType TYPE>
+void Solver<TYPE>::deallocate( Data* data )
+{
+	// Batch bookkeeping: both device buffers, then the record that held them.
+	SolverDeviceInl::ParallelSolveData* batchData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+	delete batchData->m_numConstraints;
+	delete batchData->m_offsets;
+	delete batchData;
+
+	// Helper objects are released through their own deallocate functions.
+	RadixSort32<TYPE>::deallocate( data->m_sort32 );
+	BoundSearch<TYPE>::deallocate( data->m_search );
+	PrefixScan<TYPE>::deallocate( data->m_scan );
+
+	delete data->m_sortDataBuffer;
+	delete data->m_contactBuffer;	// may be null; deleting a null pointer is a no-op
+
+	delete data;
+}
+/* Prepares nContacts contacts for solving and converts them into constraints in contactCOut.
+   Steps, in order:
+     1. make sure data->m_contactBuffer exists and holds at least nContacts entries;
+     2. map contactsIn to a native buffer;
+     3. if cfg.m_enableParallelSolve: sortContacts(), copy data->m_contactBuffer and the
+        mapped contacts through CopyConstraintKernel, then batchContacts();
+     4. convertToConstraints();
+     5. unmap the contacts with the <true> flag (presumably copy-back - confirm in BufferUtils).
+   The stopwatch splits are collected, but the printf that would report them is commented out. */
+template<DeviceType TYPE>
+void Solver<TYPE>::reorderConvertToConstraints( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf,
+	Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const typename Solver<TYPE>::ConstraintCfg& cfg )
+{
+	if( data->m_contactBuffer )
+	{
+		if( data->m_contactBuffer->getSize() < nContacts )	// too small: drop it so it is recreated below
+		{
+			BT_PROFILE("delete data->m_contactBuffer;");
+			delete data->m_contactBuffer;
+			data->m_contactBuffer = 0;
+		}
+	}
+	if( data->m_contactBuffer == 0 )
+	{
+		BT_PROFILE("new data->m_contactBuffer;");
+
+		data->m_contactBuffer = new Buffer<Contact4>( data->m_device, nContacts );
+	}
+	Stopwatch sw;
+	// NOTE(review): the map below is hard-coded to TYPE_CL instead of the template parameter TYPE.
+	Buffer<Contact4>* contactNative = BufferUtils::map<TYPE_CL, true>( data->m_device, contactsIn, nContacts );
+
+	//DeviceUtils::Config dhCfg;
+	//Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+	if( cfg.m_enableParallelSolve )
+	{
+		SolverDeviceInl::ParallelSolveData* nativeSolveData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+
+		DeviceUtils::waitForCompletion( data->m_device );
+		sw.start();
+		//	contactsIn -> data->m_contactBuffer
+		{
+			BT_PROFILE("sortContacts");
+			Solver<TYPE>::sortContacts( data, bodyBuf, contactNative, additionalData, nContacts, cfg );
+			DeviceUtils::waitForCompletion( data->m_device );
+		}
+		sw.split();
+		if(0)	// disabled alternative: copy m_contactBuffer into contactNative through host memory
+		{
+			Contact4* tmp = new Contact4[nContacts];
+			data->m_contactBuffer->read( tmp, nContacts );
+			DeviceUtils::waitForCompletion( data->m_contactBuffer->m_device );
+			contactNative->write( tmp, nContacts );
+			DeviceUtils::waitForCompletion( contactNative->m_device );
+			delete [] tmp;
+		}
+		else	// active path: kernel launched with m_contactBuffer and contactNative as its two buffers
+		{
+			BT_PROFILE("m_copyConstraintKernel");
+
+			Buffer<int4> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+
+			int4 cdata; cdata.x = nContacts;	// only x is assigned; y, z and w are not set
+			BufferInfo bInfo[] = { BufferInfo( data->m_contactBuffer ), BufferInfo( contactNative ) };
+//			Launcher launcher( data->m_device, data->m_device->getKernel( PATH, "CopyConstraintKernel",  "-I ..\\..\\ -Wf,--c++", 0 ) );
+			Launcher launcher( data->m_device, data->m_copyConstraintKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+			launcher.setConst( constBuffer, cdata );
+			launcher.launch1D( nContacts, 64 );
+			DeviceUtils::waitForCompletion( data->m_device );
+		}
+		{
+			BT_PROFILE("batchContacts");
+			Solver<TYPE>::batchContacts( data, contactNative, nContacts, nativeSolveData->m_numConstraints, nativeSolveData->m_offsets, cfg.m_staticIdx );
+
+		}
+	}
+	{
+			BT_PROFILE("waitForCompletion (batchContacts)");
+			DeviceUtils::waitForCompletion( data->m_device );
+	}
+	sw.split();
+	//================
+	if(0)	// disabled host-solver alternative; the else branch is what runs
+	{
+//		Solver<TYPE_HOST>::Data* solverHost = Solver<TYPE_HOST>::allocate( deviceHost, nContacts );
+//		Solver<TYPE_HOST>::convertToConstraints( solverHost, bodyBuf, shapeBuf, contactNative, contactCOut, additionalData, nContacts, cfg );
+//		Solver<TYPE_HOST>::deallocate( solverHost );
+	}
+	else
+	{
+		BT_PROFILE("convertToConstraints");
+		Solver<TYPE>::convertToConstraints( data, bodyBuf, shapeBuf, contactNative, contactCOut, additionalData, nContacts, cfg );
+	}
+	{
+		BT_PROFILE("convertToConstraints waitForCompletion");
+		DeviceUtils::waitForCompletion( data->m_device );
+	}
+	sw.stop();
+
+	{
+		BT_PROFILE("printf");
+
+		float t[5];
+		sw.getMs( t, 3 );	// timings are fetched but not reported; the printf below is disabled
+//		printf("%3.2f, %3.2f, %3.2f, ", t[0], t[1], t[2]);
+	}
+
+	{
+		BT_PROFILE("deallocate and unmap");
+
+		//DeviceUtils::deallocate( deviceHost );
+
+		BufferUtils::unmap<true>( contactNative, contactsIn, nContacts );
+	}
+}
+
+
+// Runs the batch solve kernel over the constraints in `constraint`.
+//   data           - solver state from allocate(); must not be null
+//   bodyBuf        - rigid bodies; unmapped with the <true> flag after the solve
+//   shapeBuf       - per-body inertia data; unmapped with the <false> flag
+//   constraint     - SolverData handle, used as a Buffer<Constraint4>*
+//   additionalData - not used by this implementation
+//   n              - only seeds cdata.x, which is overwritten before the first launch
+// Two passes are made (cdata.x == 0, then cdata.x == 1). Each pass runs
+// data->m_nIterations iterations, and each iteration launches the kernel once
+// per batch index 0..N_BATCHES-1. The kernel-side meaning of cdata.x and of
+// the constant 250 in cdata.y is not visible from this file.
+// Change from the previous version: the DEBUG_ME printf calls passed two
+// arguments to a format string with a single %d, so they printed the index
+// instead of the value; the format now prints both.
+template<DeviceType TYPE>
+void Solver<TYPE>::solveContactConstraint( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			SolverData constraint, void* additionalData, int n )
+{
+	// Disabled host fallback, kept for reference.
+	if(0)
+	{
+		DeviceUtils::Config dhCfg;
+		Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+		{
+			Solver<TYPE_HOST>::Data* hostData = Solver<TYPE_HOST>::allocate( deviceHost, 0 );
+			Solver<TYPE_HOST>::solveContactConstraint( hostData, bodyBuf, shapeBuf, constraint, additionalData, n );
+			Solver<TYPE_HOST>::deallocate( hostData );
+		}
+		DeviceUtils::deallocate( deviceHost );
+		return;
+	}
+
+	ADLASSERT( data );
+
+	Buffer<Constraint4>* cBuffer =0;
+	
+	Buffer<RigidBodyBase::Body>* gBodyNative=0; 
+	Buffer<RigidBodyBase::Inertia>* gShapeNative =0;
+	Buffer<Constraint4>* gConstraintNative =0;
+	
+
+	{
+		BT_PROFILE("map");
+		cBuffer = (Buffer<Constraint4>*)constraint;
+
+		gBodyNative= BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );
+		gShapeNative= BufferUtils::map<TYPE, true>( data->m_device, shapeBuf );
+		gConstraintNative = BufferUtils::map<TYPE, true>( data->m_device, cBuffer );
+		DeviceUtils::waitForCompletion( data->m_device );
+	}
+
+	// NOTE(review): this constant buffer is default-constructed, whereas the
+	// other launches in this file build theirs with (device, 1, BUFFER_CONST).
+	Buffer<int4> constBuffer;
+	int4 cdata = make_int4( n, 0, 0, 0 );
+	{
+		SolverDeviceInl::ParallelSolveData* solveData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+		const int nn = N_SPLIT*N_SPLIT;
+
+		cdata.x = 0;	// first pass
+		cdata.y = 250;
+
+#if 0
+//check how the cells are filled
+		unsigned int* hostCounts = new unsigned int[N_SPLIT*N_SPLIT];
+		solveData->m_numConstraints->read(hostCounts,N_SPLIT*N_SPLIT);
+		DeviceUtils::waitForCompletion( data->m_device );
+		for (int i=0;i<N_SPLIT*N_SPLIT;i++)
+		{
+			if (hostCounts[i])
+			{
+				printf("hostCounts[%d]=%d\n",i,hostCounts[i]);
+			}
+		}
+		delete[] hostCounts;
+#endif
+
+		// 64 work items for each of the nn/N_BATCHES cells handled per launch.
+		int numWorkItems = 64*nn/N_BATCHES;
+#ifdef DEBUG_ME
+		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
+		adl::Buffer<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+#endif
+
+
+
+		{
+
+			BT_PROFILE("m_batchSolveKernel iterations");
+			for(int iter=0; iter<data->m_nIterations; iter++)
+			{
+				for(int ib=0; ib<N_BATCHES; ib++)
+				{
+#ifdef DEBUG_ME
+					// Start every launch from a zeroed debug buffer.
+					memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
+					gpuDebugInfo.write(debugInfo,numWorkItems);
+#endif
+
+
+					cdata.z = ib;
+					cdata.w = N_SPLIT;
+
+				
+
+					BufferInfo bInfo[] = { 
+
+						BufferInfo( gBodyNative ), 
+						BufferInfo( gShapeNative ), 
+						BufferInfo( gConstraintNative ),
+						BufferInfo( solveData->m_numConstraints ), 
+						BufferInfo( solveData->m_offsets ) 
+#ifdef DEBUG_ME
+						,	BufferInfo(&gpuDebugInfo)
+#endif
+						};
+
+					Launcher launcher( data->m_device, data->m_batchSolveKernel );
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+					launcher.setConst( constBuffer, cdata );
+					
+					launcher.launch1D( numWorkItems, 64 );
+
+#ifdef DEBUG_ME
+					DeviceUtils::waitForCompletion( data->m_device );
+					gpuDebugInfo.read(debugInfo,numWorkItems);
+					DeviceUtils::waitForCompletion( data->m_device );
+					for (int i=0;i<numWorkItems;i++)
+					{
+						if (debugInfo[i].m_valInt2>0)
+						{
+							printf("debugInfo[%d].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2);
+						}
+
+						if (debugInfo[i].m_valInt3>0)
+						{
+							printf("debugInfo[%d].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3);
+						}
+					}
+#endif //DEBUG_ME
+
+
+				}
+			}
+		
+			DeviceUtils::waitForCompletion( data->m_device );
+
+
+		}
+
+		cdata.x = 1;	// second pass
+		{
+			BT_PROFILE("m_batchSolveKernel iterations2");
+			for(int iter=0; iter<data->m_nIterations; iter++)
+			{
+				for(int ib=0; ib<N_BATCHES; ib++)
+				{
+					cdata.z = ib;
+					cdata.w = N_SPLIT;
+
+					BufferInfo bInfo[] = { 
+						BufferInfo( gBodyNative ), 
+						BufferInfo( gShapeNative ), 
+						BufferInfo( gConstraintNative ),
+						BufferInfo( solveData->m_numConstraints ), 
+						BufferInfo( solveData->m_offsets )
+#ifdef DEBUG_ME
+						,BufferInfo(&gpuDebugInfo)
+#endif //DEBUG_ME
+					};
+					Launcher launcher( data->m_device, data->m_batchSolveKernel );
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+					launcher.setConst( constBuffer, cdata );
+					launcher.launch1D( 64*nn/N_BATCHES, 64 );
+				}
+			}
+			DeviceUtils::waitForCompletion( data->m_device );
+			
+		}
+#ifdef DEBUG_ME
+		delete[] debugInfo;
+#endif //DEBUG_ME
+	}
+
+	{
+		BT_PROFILE("unmap");
+		// Bodies and constraints use the <true> flag, the inertia data <false>.
+		BufferUtils::unmap<true>( gBodyNative, bodyBuf );
+		BufferUtils::unmap<false>( gShapeNative, shapeBuf );
+		BufferUtils::unmap<true>( gConstraintNative, cBuffer );
+		DeviceUtils::waitForCompletion( data->m_device );
+	}
+}
+/* Launches data->m_contactToConstraintKernel over nContacts contacts, with contactsIn,
+   bodyBuf and shapeBuf as inputs and the Buffer<Constraint4> behind contactCOut as output.
+   Asserts that the device is an OpenCL device. additionalData is not used here.
+   Only the constraint buffer is unmapped with the <true> flag (presumably copy-back - confirm
+   in BufferUtils); the three inputs use <false>. */
+template<DeviceType TYPE>
+void Solver<TYPE>::convertToConstraints( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+	Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_CL );
+
+	Buffer<RigidBodyBase::Body>* bodyNative =0;
+	Buffer<RigidBodyBase::Inertia>* shapeNative =0;
+	Buffer<Contact4>* contactNative =0;
+	Buffer<Constraint4>* constraintNative =0;
+
+	{
+		BT_PROFILE("map buffers");
+
+		bodyNative = BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );
+		shapeNative  = BufferUtils::map<TYPE, true>( data->m_device, shapeBuf );
+		contactNative= BufferUtils::map<TYPE, true>( data->m_device, contactsIn );
+		constraintNative = BufferUtils::map<TYPE, false>( data->m_device, (Buffer<Constraint4>*)contactCOut );	// output only: mapped with <false>
+	}
+	struct CB	// constants passed to the kernel through setConst below
+	{
+		int m_nContacts;
+		float m_dt;
+		float m_positionDrift;
+		float m_positionConstraintCoeff;
+	};
+
+	{
+		BT_PROFILE("m_contactToConstraintKernel");
+		CB cdata;
+		cdata.m_nContacts = nContacts;
+		cdata.m_dt = cfg.m_dt;
+		cdata.m_positionDrift = cfg.m_positionDrift;
+		cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
+
+		Buffer<CB> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+		BufferInfo bInfo[] = { BufferInfo( contactNative ), BufferInfo( bodyNative ), BufferInfo( shapeNative ),
+			BufferInfo( constraintNative )};
+		Launcher launcher( data->m_device, data->m_contactToConstraintKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		launcher.launch1D( nContacts, 64 );	
+		DeviceUtils::waitForCompletion( data->m_device );
+
+	}
+
+	{
+		BT_PROFILE("unmap");
+		BufferUtils::unmap<false>( bodyNative, bodyBuf );
+		BufferUtils::unmap<false>( shapeNative, shapeBuf );
+		BufferUtils::unmap<false>( contactNative, contactsIn );
+		BufferUtils::unmap<true>( constraintNative, (Buffer<Constraint4>*)contactCOut );
+	}
+}
+
+//	Device path: groups contacts by spatial cell when cfg.m_enableParallelSolve is set.
+//	Steps (numbering follows the original comments):
+//	  2. m_setSortDataKernel writes sort entries into data->m_sortDataBuffer,
+//	  3. RadixSort32 sorts that buffer,
+//	  4. BoundSearch (COUNT) + PrefixScan fill the per-cell counts and offsets held in
+//	     data->m_parallelSolveData,
+//	  5. m_reorderContactKernel writes the reordered contacts into data->m_contactBuffer.
+//	When parallel solve is disabled the buffers are only mapped and unmapped.
+//	  data           - solver state; its device type must be TYPE_CL (asserted).
+//	  bodyBuf        - rigid bodies read by the set-sort-data kernel.
+//	  contactsIn     - contacts to be sorted.
+//	  additionalData - not used by this function.
+//	  nContacts      - number of contacts.
+//	  cfg            - supplies m_enableParallelSolve, m_staticIdx and m_averageExtent.
+template<DeviceType TYPE>
+void Solver<TYPE>::sortContacts( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+			Buffer<Contact4>* contactsIn, void* additionalData, 
+			int nContacts, const typename Solver<TYPE>::ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_CL );
+	Buffer<RigidBodyBase::Body>* bodyNative 
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, bodyBuf );
+	Buffer<Contact4>* contactNative 
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, contactsIn );
+
+	const int sortAlignment = 512; // todo. get this out of sort
+	if( cfg.m_enableParallelSolve )
+	{
+		SolverDeviceInl::ParallelSolveData* nativeSolveData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+
+		//	The sort works on a size rounded up to sortAlignment; the search below still uses nContacts.
+		int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );
+
+		Buffer<u32>* countsNative = nativeSolveData->m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
+		Buffer<u32>* offsetsNative = nativeSolveData->m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
+
+		{	//	2. set cell idx
+			struct CB
+			{
+				int m_nContacts;
+				int m_staticIdx;
+				float m_scale;
+				int m_nSplit;
+			};
+
+			//	The kernel is launched with a work-group size of 64 over sortSize items.
+			ADLASSERT( sortSize%64 == 0 );
+			CB cdata;
+			cdata.m_nContacts = nContacts;
+			cdata.m_staticIdx = cfg.m_staticIdx;
+			cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
+			cdata.m_nSplit = N_SPLIT;
+
+			Buffer<CB> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+			BufferInfo bInfo[] = { BufferInfo( contactNative ), BufferInfo( bodyNative ), BufferInfo( data->m_sortDataBuffer ) };
+			Launcher launcher( data->m_device, data->m_setSortDataKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+			launcher.setConst( constBuffer, cdata );
+			launcher.launch1D( sortSize, 64 );
+		}
+
+		{	//	3. sort by cell idx
+			//	The unused locals that used to pick a narrower sort width were removed;
+			//	execute() is called without a bit-count argument.
+			RadixSort32<TYPE>::execute( data->m_sort32, *data->m_sortDataBuffer,sortSize);
+		}
+		{	//	4. find entries
+			BoundSearch<TYPE>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative, N_SPLIT*N_SPLIT, BoundSearchBase::COUNT );
+
+			PrefixScan<TYPE>::execute( data->m_scan, *countsNative, *offsetsNative, N_SPLIT*N_SPLIT );
+		}
+
+		{	//	5. sort constraints by cellIdx
+			//	todo. preallocate this
+//			ADLASSERT( contactsIn->getType() == TYPE_HOST );
+//			Buffer<Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn );	//	copying contacts to this buffer
+
+			{
+				Buffer<int4> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+
+				//	Only .x is set here; the remaining components are left as they are.
+				int4 cdata; cdata.x = nContacts;
+				BufferInfo bInfo[] = { BufferInfo( contactNative ), BufferInfo( data->m_contactBuffer ), BufferInfo( data->m_sortDataBuffer ) };
+				Launcher launcher( data->m_device, data->m_reorderContactKernel );
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+				launcher.setConst( constBuffer, cdata );
+				launcher.launch1D( nContacts, 64 );
+			}
+//			BufferUtils::unmap<true>( out, contactsIn, nContacts );
+		}
+	}
+
+	BufferUtils::unmap<false>( bodyNative, bodyBuf );
+	BufferUtils::unmap<false>( contactNative, contactsIn );
+}
+
+//	Device path: runs data->m_batchingKernel over the contacts and copies the result back.
+//	The kernel gets four buffers in this order: the mapped contacts, data->m_contactBuffer,
+//	the mapped n buffer and the mapped offsets buffer. Afterwards data->m_contactBuffer is
+//	written into the mapped contact buffer, which is then unmapped with <true>.
+//	  data      - solver state; its device type must be TYPE_CL (asserted).
+//	  contacts  - contacts to batch; receives the kernel's output.
+//	  nContacts - number of contacts.
+//	  n         - per-cell counts (N_SPLIT*N_SPLIT entries are read in the disabled dump below).
+//	  offsets   - per-cell offsets.
+//	  staticIdx - passed to the kernel in cdata.z.
+//	The if(0) sections are disabled debugging aids and are kept unchanged.
+template<DeviceType TYPE>
+void Solver<TYPE>::batchContacts( typename Solver<TYPE>::Data* data, Buffer<Contact4>* contacts, int nContacts, Buffer<u32>* n, Buffer<u32>* offsets, int staticIdx )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_CL );
+
+	//	Disabled: run the batching on a temporary host device instead of the GPU.
+	if(0)
+	{
+		BT_PROFILE("CPU classTestKernel/Kernel (batch generation?)");
+
+		DeviceUtils::Config dhCfg;
+		Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+		{
+			Solver<TYPE_HOST>::Data* hostData = Solver<TYPE_HOST>::allocate( deviceHost, 0 );
+			Solver<TYPE_HOST>::batchContacts( hostData, contacts, nContacts, n, offsets, staticIdx );
+			Solver<TYPE_HOST>::deallocate( hostData );
+		}
+		DeviceUtils::deallocate( deviceHost );
+		return;
+	}
+
+	Buffer<Contact4>* contactNative 
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, contacts, nContacts );
+	Buffer<u32>* nNative
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, n );
+	Buffer<u32>* offsetsNative
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, offsets );
+
+	{
+		BT_PROFILE("GPU classTestKernel/Kernel (batch generation?)");
+		Buffer<int4> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+		int4 cdata;
+		cdata.x = nContacts;
+		cdata.y = 0;
+		cdata.z = staticIdx;
+
+		//	64 work-items per cell of the N_SPLIT x N_SPLIT grid.
+		int numWorkItems = 64*N_SPLIT*N_SPLIT;
+#ifdef BATCH_DEBUG
+		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
+		adl::Buffer<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+		memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
+		gpuDebugInfo.write(debugInfo,numWorkItems);
+#endif
+
+
+		BufferInfo bInfo[] = { 
+			BufferInfo( contactNative ), 
+			BufferInfo( data->m_contactBuffer ), 
+			BufferInfo( nNative ), 
+			BufferInfo( offsetsNative ) 
+#ifdef BATCH_DEBUG
+			,	BufferInfo(&gpuDebugInfo)
+#endif
+		};
+
+		
+		
+		Launcher launcher( data->m_device, data->m_batchingKernel);
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		launcher.launch1D( numWorkItems, 64 );
+		DeviceUtils::waitForCompletion( data->m_device );
+
+#ifdef BATCH_DEBUG
+		//	A stray token that used to sit here made this path uncompilable; it was removed.
+		Contact4* hostContacts = new Contact4[nContacts];
+		data->m_contactBuffer->read(hostContacts,nContacts);
+		DeviceUtils::waitForCompletion( data->m_device );
+
+		gpuDebugInfo.read(debugInfo,numWorkItems);
+		DeviceUtils::waitForCompletion( data->m_device );
+
+		for (int i=0;i<numWorkItems;i++)
+		{
+			if (debugInfo[i].m_valInt1>0)
+			{
+				printf("catch\n");
+			}
+			if (debugInfo[i].m_valInt2>0)
+			{
+				printf("catch22\n");
+			}
+
+			if (debugInfo[i].m_valInt3>0)
+			{
+				printf("catch666\n");
+			}
+
+			if (debugInfo[i].m_valInt4>0)
+			{
+				printf("catch777\n");
+			}
+		}
+		//	Release both host-side debug copies (hostContacts used to leak).
+		delete[] hostContacts;
+		delete[] debugInfo;
+#endif //BATCH_DEBUG
+
+	}
+
+	//	Disabled: dump per-cell counts and the batch index sequence to stdout.
+	if(0)
+	{
+		u32* nhost = new u32[N_SPLIT*N_SPLIT];
+
+		nNative->read( nhost, N_SPLIT*N_SPLIT );
+
+		Contact4* chost = new Contact4[nContacts];
+		data->m_contactBuffer->read( chost, nContacts );
+		DeviceUtils::waitForCompletion( data->m_device );
+		printf(">>");
+		int nonzero = 0;
+		u32 maxn = 0;
+		for(int i=0; i<N_SPLIT*N_SPLIT; i++)
+		{
+			printf("%d-", nhost[i]);
+			nonzero += (nhost[i]==0)? 0:1;
+			maxn = max2( nhost[i], maxn );
+		}
+		printf("\nnonzero:zero = %d:%d (%d)\n", nonzero, N_SPLIT*N_SPLIT-nonzero, maxn);
+		printf("\n\n");
+
+		int prev = 0;
+		int prevIdx = 0;
+		int maxNBatches = 0;
+		for(int i=0; i<nContacts; i++)
+		{
+//			printf("(%d, %d:%d),", chost[i].m_batchIdx, chost[i].m_bodyAPtr, chost[i].m_bodyBPtr);
+			if( prev != 0 && chost[i].m_batchIdx == 0 )
+			{
+				maxNBatches = max2( maxNBatches, prev );
+				printf("\n[%d]", prev);
+
+				//for(int j=prevIdx; j<i; j++)
+				//{
+				//	printf("(%d:%d),", chost[j].m_bodyAPtr, chost[j].m_bodyBPtr);
+				//}
+
+				//printf("\n");
+
+				prevIdx = i;
+			}
+
+			printf("%d,", chost[i].m_batchIdx);
+
+			prev = chost[i].m_batchIdx;
+		}
+		printf("\n");
+		printf("Max: %d\n", maxNBatches);
+
+		delete [] chost;
+		delete [] nhost;
+	}
+//	copy buffer to buffer
+	contactNative->write( *data->m_contactBuffer, nContacts );
+	DeviceUtils::waitForCompletion( data->m_device );
+
+	//	Disabled: check on the host that every body index is <= staticIdx.
+	if(0)
+	{
+		DeviceUtils::Config dhCfg;
+		Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+		{
+			HostBuffer<Contact4> host( deviceHost, nContacts );
+			contactNative->read( host.m_ptr, nContacts );
+			DeviceUtils::waitForCompletion( data->m_device );
+
+			for(int i=0; i<nContacts; i++)
+			{
+				ADLASSERT( host[i].m_bodyAPtr <= (u32)staticIdx );
+				ADLASSERT( host[i].m_bodyBPtr <= (u32)staticIdx );
+			}
+		}
+		DeviceUtils::deallocate( deviceHost );
+	}
+
+	BufferUtils::unmap<true>( contactNative, contacts );
+	BufferUtils::unmap<false>( nNative, n );
+	BufferUtils::unmap<false>( offsetsNative, offsets );
+}
+
+#undef PATH
+#undef KERNEL1
+#undef KERNEL2
+
+#undef KERNEL3
+#undef KERNEL4
+#undef KERNEL5
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverHost.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverHost.inl
@@ -0,0 +1,848 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+//	Static helpers used by the host solver: building Constraint4 records from contacts,
+//	applying contact and friction impulses, and splitting contacts into batches.
+class SolverInl
+{
+public:
+	typedef SolverBase::ConstraintData ConstraintData;
+
+
+	//	Builds the Jacobian rows for direction n and lever arms r0 / r1:
+	//	linear = -n, angular0 = -(r0 x n), angular1 = r1 x n.
+	static
+	__forceinline
+	void setLinearAndAngular(const MYF4& n, const MYF4& r0, const MYF4& r1,
+							 MYF4& linear, MYF4& angular0, MYF4& angular1)
+	{
+		linear = -n;
+		angular0 = -cross3(r0, n);
+		angular1 = cross3(r1, n);
+	}
+
+	//	Returns -1 / (invMass0 + a0.(I0 a0) + invMass1 + a1.(I1 a1)).
+	//	linear0 and linear1 are not read: the linear terms are taken as the plain inverse masses.
+	static
+	__forceinline
+	float calcJacCoeff(const MYF4& linear0, const MYF4& linear1, const MYF4& angular0, const MYF4& angular1,
+					  float invMass0, const Matrix3x3& invInertia0, float invMass1, const Matrix3x3& invInertia1)
+	{
+		//	linear0,1 are normalized, so dot3F4(linear, linear) would be 1
+		float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;
+		float jmj1 = dot3F4(mtMul3(angular0,invInertia0), angular0);
+		float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;
+		float jmj3 = dot3F4(mtMul3(angular1,invInertia1), angular1);
+		return -1.f/(jmj0+jmj1+jmj2+jmj3);
+	}
+
+	//	Returns l0.linVel0 + a0.angVel0 + l1.linVel1 + a1.angVel1.
+	static
+	__forceinline
+	float calcRelVel(const MYF4& l0, const MYF4& l1, const MYF4& a0, const MYF4& a1, 
+					 const MYF4& linVel0, const MYF4& angVel0, const MYF4& linVel1, const MYF4& angVel1)
+	{
+		return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);
+	}
+
+	//	Fills dstC from contact src and the state of bodies A and B.
+	//	For each of the src.getNPoints() contact points it stores the Jacobian coefficient and
+	//	the bias m_b (restitution term plus penetration correction scaled by 1/cfg.m_dt);
+	//	unused point slots get m_jacCoeffInv = 0 and a zero world position.
+	//	Friction coefficients (m_fJacCoeffInv) and m_center are only computed when the contact
+	//	has more than one point; otherwise m_fJacCoeffInv stays {0, 0}.
+	static
+	__forceinline
+	void setConstraint4( const MYF4& posA, const MYF4& linVelA, const MYF4& angVelA, float invMassA, const Matrix3x3& invInertiaA, 
+		const MYF4& posB, const MYF4& linVelB, const MYF4& angVelB, float invMassB, const Matrix3x3& invInertiaB, 
+		const Contact4& src, const SolverBase::ConstraintCfg& cfg, 
+		Constraint4& dstC )
+	{
+		dstC.m_bodyA = (u32)src.m_bodyAPtr;
+		dstC.m_bodyB = (u32)src.m_bodyBPtr;
+
+		float dtInv = 1.f/cfg.m_dt;
+		for(int ic=0; ic<4; ic++)
+		{
+			dstC.m_appliedRambdaDt[ic] = 0.f;
+		}
+		dstC.m_fJacCoeffInv[0] = dstC.m_fJacCoeffInv[1] = 0.f;
+
+
+		const MYF4& n = src.m_worldNormal;
+		dstC.m_linear = -n;
+		dstC.setFrictionCoeff( src.getFrictionCoeff() );
+		for(int ic=0; ic<4; ic++)
+		{
+			MYF4 r0 = src.m_worldPos[ic] - posA;
+			MYF4 r1 = src.m_worldPos[ic] - posB;
+
+			//	Slots beyond the contact's point count are marked inactive.
+			if( ic >= src.getNPoints() )
+			{
+				dstC.m_jacCoeffInv[ic] = 0.f;
+				continue;
+			}
+
+			float relVelN;
+			{
+				MYF4 linear, angular0, angular1;
+				setLinearAndAngular(n, r0, r1, linear, angular0, angular1);
+
+				dstC.m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,
+					invMassA, invInertiaA, invMassB, invInertiaB );
+
+				relVelN = calcRelVel(linear, -linear, angular0, angular1,
+					linVelA, angVelA, linVelB, angVelB);
+
+				//	Restitution is dropped when the squared normal velocity is below 0.004.
+				float e = src.getRestituitionCoeff();
+				if( relVelN*relVelN < 0.004f ) e = 0.f;
+
+				dstC.m_b[ic] = e*relVelN;
+				dstC.m_b[ic] += (src.getPenetration(ic) + cfg.m_positionDrift)*cfg.m_positionConstraintCoeff*dtInv;
+				dstC.m_appliedRambdaDt[ic] = 0.f;
+			}
+		}
+
+		if( src.getNPoints() > 1 )
+		{	//	prepare friction
+			//	center = average of the contact points
+			MYF4 center = MAKE_MYF4(0.f);
+			for(int i=0; i<src.getNPoints(); i++) center += src.m_worldPos[i];
+			center /= (float)src.getNPoints();
+
+			//	Two tangents derived from the normal and the direction center -> first point.
+			MYF4 tangent[2];
+			tangent[0] = cross3( src.m_worldNormal, src.m_worldPos[0]-center );
+			tangent[1] = cross3( tangent[0], src.m_worldNormal );
+			tangent[0] = normalize3( tangent[0] );
+			tangent[1] = normalize3( tangent[1] );
+			MYF4 r[2];
+			r[0] = center - posA;
+			r[1] = center - posB;
+
+			for(int i=0; i<2; i++)
+			{
+				MYF4 linear, angular0, angular1;
+				setLinearAndAngular(tangent[i], r[0], r[1], linear, angular0, angular1);
+
+				dstC.m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,
+					invMassA, invInertiaA, invMassB, invInertiaB );
+				dstC.m_fAppliedRambdaDt[i] = 0.f;
+			}
+			dstC.m_center = center;
+		}
+		else
+		{
+			//	single point constraint
+		}
+
+		for(int i=0; i<4; i++)
+		{
+			if( i<src.getNPoints() )
+			{
+				dstC.m_worldPos[i] = src.m_worldPos[i];
+			}
+			else
+			{
+				dstC.m_worldPos[i] = MAKE_MYF4(0.f);
+			}
+		}
+	}
+
+/*
+	struct Constraint4
+	{
+		float4 m_linear;			X
+		float4 m_angular0[4];		X
+		float4 m_angular1[4];		center
+		float m_jacCoeffInv[4];		[0,1]
+		float m_b[4];				X
+		float m_appliedRambdaDt[4];	[0,1]
+
+		void* m_bodyAPtr;			X
+		void* m_bodyBPtr;			X
+	};
+*/
+	//	Applies the two friction impulses of cs to the body velocities (modified in place).
+	//	The tangents are rebuilt from cs.m_linear, cs.m_worldPos[0] and cs.m_center in the same
+	//	way setConstraint4 built them. The accumulated impulse of tangent i is clamped to
+	//	[minRambdaDt[i], maxRambdaDt[i]]; only entries 0 and 1 of those arrays are read.
+	//	A damping step on the angular velocity about the normal follows under the condition below.
+	static
+	__inline
+	void solveFriction(Constraint4& cs, 
+		const MYF4& posA, MYF4& linVelA, MYF4& angVelA, float invMassA, const Matrix3x3& invInertiaA,
+		const MYF4& posB, MYF4& linVelB, MYF4& angVelB, float invMassB, const Matrix3x3& invInertiaB, 
+		float maxRambdaDt[4], float minRambdaDt[4])
+	{
+		//	Nothing to do when neither friction row was prepared (both coefficients are zero).
+		//	The second operand must test index 1; testing index 0 twice ignored the second row.
+		if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[1] == 0 ) return;
+		const MYF4& center = cs.m_center;
+
+		MYF4 n = -cs.m_linear;
+
+		MYF4 tangent[2];
+		tangent[0] = cross3( n, cs.m_worldPos[0]-center );
+		tangent[1] = cross3( tangent[0], n );
+		tangent[0] = normalize3( tangent[0] );
+		tangent[1] = normalize3( tangent[1] );
+
+		MYF4 angular0, angular1, linear;
+		MYF4 r0 = center - posA;
+		MYF4 r1 = center - posB;
+		for(int i=0; i<2; i++)
+		{
+			setLinearAndAngular( tangent[i], r0, r1, linear, angular0, angular1 );
+			float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
+				linVelA, angVelA, linVelB, angVelB );
+			rambdaDt *= cs.m_fJacCoeffInv[i];
+
+				{
+					//	Clamp the accumulated impulse, then apply only the difference.
+					float prevSum = cs.m_fAppliedRambdaDt[i];
+					float updated = prevSum;
+					updated += rambdaDt;
+					updated = max2( updated, minRambdaDt[i] );
+					updated = min2( updated, maxRambdaDt[i] );
+					rambdaDt = updated - prevSum;
+					cs.m_fAppliedRambdaDt[i] = updated;
+				}
+
+			MYF4 linImp0 = invMassA*linear*rambdaDt;
+			MYF4 linImp1 = invMassB*(-linear)*rambdaDt;
+			MYF4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
+			MYF4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
+
+			linVelA += linImp0;
+			angVelA += angImp0;
+			linVelB += linImp1;
+			angVelB += angImp1;
+		}
+
+		{	//	angular damping for point constraint
+			//	Applied when the contact center lies close to the A->B direction (dot > 0.95)
+			//	or when either body has zero inverse mass: removes 10% of the spin about n.
+			MYF4 ab = normalize3( posB - posA );
+			MYF4 ac = normalize3( center - posA );
+			if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
+			{
+				float angNA = dot3F4( n, angVelA );
+				float angNB = dot3F4( n, angVelB );
+
+				angVelA -= (angNA*0.1f)*n;
+				angVelB -= (angNB*0.1f)*n;
+			}
+		}
+	}
+
+	//	Applies the normal impulses of the (up to four) contact points of cs.
+	//	The accumulated impulse of point ic is clamped to [minRambdaDt[ic], maxRambdaDt[ic]].
+	//	JACOBI == true : impulses of all points are summed and added to the velocities after the loop.
+	//	JACOBI == false: each impulse is added immediately, so later points see the updated velocities.
+	template<bool JACOBI>
+	static
+	__inline
+	void solveContact(Constraint4& cs, 
+		const MYF4& posA, MYF4& linVelA, MYF4& angVelA, float invMassA, const Matrix3x3& invInertiaA,
+		const MYF4& posB, MYF4& linVelB, MYF4& angVelB, float invMassB, const Matrix3x3& invInertiaB, 
+		float maxRambdaDt[4], float minRambdaDt[4])
+	{
+		MYF4 dLinVelA = MAKE_MYF4(0.f);
+		MYF4 dAngVelA = MAKE_MYF4(0.f);
+		MYF4 dLinVelB = MAKE_MYF4(0.f);
+		MYF4 dAngVelB = MAKE_MYF4(0.f);
+
+		for(int ic=0; ic<4; ic++)
+		{
+			//	Not strictly required: a zero coefficient would make the impulse zero anyway.
+			if( cs.m_jacCoeffInv[ic] == 0.f ) continue;
+
+			{
+				MYF4 angular0, angular1, linear;
+				MYF4 r0 = cs.m_worldPos[ic] - posA;
+				MYF4 r1 = cs.m_worldPos[ic] - posB;
+				setLinearAndAngular( -cs.m_linear, r0, r1, linear, angular0, angular1 );
+
+				float rambdaDt = calcRelVel(cs.m_linear, -cs.m_linear, angular0, angular1,
+					linVelA, angVelA, linVelB, angVelB ) + cs.m_b[ic];
+				rambdaDt *= cs.m_jacCoeffInv[ic];
+
+				{
+					//	Clamp the accumulated impulse, then apply only the difference.
+					float prevSum = cs.m_appliedRambdaDt[ic];
+					float updated = prevSum;
+					updated += rambdaDt;
+					updated = max2( updated, minRambdaDt[ic] );
+					updated = min2( updated, maxRambdaDt[ic] );
+					rambdaDt = updated - prevSum;
+					cs.m_appliedRambdaDt[ic] = updated;
+				}
+
+				MYF4 linImp0 = invMassA*linear*rambdaDt;
+				MYF4 linImp1 = invMassB*(-linear)*rambdaDt;
+				MYF4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
+				MYF4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
+
+				if( JACOBI )
+				{
+					dLinVelA += linImp0;
+					dAngVelA += angImp0;
+					dLinVelB += linImp1;
+					dAngVelB += angImp1;
+				}
+				else
+				{
+					linVelA += linImp0;
+					angVelA += angImp0;
+					linVelB += linImp1;
+					angVelB += angImp1;
+				}
+			}
+		}
+
+		if( JACOBI )
+		{
+			linVelA += dLinVelA;
+			angVelA += dAngVelA;
+			linVelB += dLinVelB;
+			angVelB += dAngVelB;
+		}
+	}
+
+	enum
+	{
+		N_SPLIT = SolverBase::N_SPLIT,
+	};
+
+	//	for parallel solve: per-cell contact counts and start offsets (N_SPLIT*N_SPLIT cells)
+	struct ParallelSolveData
+	{
+		u32 m_n[N_SPLIT*N_SPLIT];
+		u32 m_offset[N_SPLIT*N_SPLIT];
+	};
+
+	//	Assigns a batch index to each of the n contacts in cs and reorders cs by that index.
+	//	Each pass of the while loop is one batch: a contact joins it when neither of its bodies
+	//	is flagged as used in this pass. Bodies are tracked through the low 8 bits of their
+	//	index (N_FLG = 256 flag bits), so two different bodies can share a flag. A body equal
+	//	to ignoreIdx never blocks a contact. When simdWidth contacts have been accepted in a
+	//	pass the flags are cleared again; the default simdWidth of -1 never matches.
+	//	Returns the number of passes (batches) used.
+	static
+	__inline
+	int sortConstraintByBatch(Contact4* cs, int n, int ignoreIdx, int simdWidth = -1)
+	{
+		SortData* sortData;
+		{
+			BT_PROFILE("new");
+			sortData = new SortData[n];
+		}
+
+		//	idxSrc and idxDst start on the same buffer: rejected indices are compacted in place.
+		//	That is safe because nIdxDst never gets ahead of the read position i.
+		u32* idxBuffer = new u32[n];
+		u32* idxSrc = idxBuffer;
+		u32* idxDst = idxBuffer;
+		int nIdxSrc, nIdxDst;
+
+		const int N_FLG = 256;
+		const int FLG_MASK = N_FLG-1;
+		u32 flg[N_FLG/32];
+#if defined(_DEBUG)
+		for(int i=0; i<n; i++) cs[i].getBatchIdx() = -1; 
+#endif
+		for(int i=0; i<n; i++) idxSrc[i] = i;
+		nIdxSrc = n;
+
+		int batchIdx = 0;
+
+		{
+			BT_PROFILE("batching");
+			while( nIdxSrc )
+			{
+				nIdxDst = 0;
+				int nCurrentBatch = 0;
+
+				//	clear flag
+				for(int i=0; i<N_FLG/32; i++) flg[i] = 0;
+
+				for(int i=0; i<nIdxSrc; i++)
+				{
+					int idx = idxSrc[i];
+					ADLASSERT( idx < n );
+					//	check if it can go
+					int aIdx = cs[idx].m_bodyAPtr & FLG_MASK;
+					int bIdx = cs[idx].m_bodyBPtr & FLG_MASK;
+
+					//	Unsigned literal: bit 31 must not be produced by shifting a signed 1.
+					u32 aUnavailable = flg[ aIdx/32 ] & (1u<<(aIdx&31));
+					u32 bUnavailable = flg[ bIdx/32 ] & (1u<<(bIdx&31));
+
+					aUnavailable = (ignoreIdx==cs[idx].m_bodyAPtr)? 0:aUnavailable;
+					bUnavailable = (ignoreIdx==cs[idx].m_bodyBPtr)? 0:bUnavailable;
+
+					if( aUnavailable==0 && bUnavailable==0 ) // ok 
+					{
+						flg[ aIdx/32 ] |= (1u<<(aIdx&31));
+						flg[ bIdx/32 ] |= (1u<<(bIdx&31));
+						cs[idx].getBatchIdx() = batchIdx;
+						sortData[idx].m_key = batchIdx;
+						sortData[idx].m_value = idx;
+
+						{
+							nCurrentBatch++;
+							if( nCurrentBatch == simdWidth )
+							{
+								nCurrentBatch = 0;
+								for(int i=0; i<N_FLG/32; i++) flg[i] = 0;
+							}
+						}
+					}
+					else
+					{
+						//	Could not be placed in this batch: retry in the next pass.
+						idxDst[nIdxDst++] = idx;
+					}
+				}
+				swap2( idxSrc, idxDst );
+				swap2( nIdxSrc, nIdxDst );
+				batchIdx ++;
+			}
+		}
+
+		
+
+		{
+			BT_PROFILE("radix sort data");
+			//	sort SortData (key = batch index, value = original contact index) on a temporary host device
+			Device::Config cfg;
+			Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, cfg );
+			{
+				Buffer<SortData> sortBuffer; sortBuffer.setRawPtr( deviceHost, sortData, n );
+				RadixSort<TYPE_HOST>::Data* sort = RadixSort<TYPE_HOST>::allocate( deviceHost, n );
+
+				RadixSort<TYPE_HOST>::execute( sort, sortBuffer, n );
+
+				RadixSort<TYPE_HOST>::deallocate( sort );
+			}
+			DeviceUtils::deallocate( deviceHost );
+		}
+
+		{	
+				BT_PROFILE("reorder");
+			//	reorder: gather the contacts in sorted order through a temporary copy
+			Contact4* old = new Contact4[n];
+			memcpy( old, cs, sizeof(Contact4)*n);
+			for(int i=0; i<n; i++)
+			{
+				int idx = sortData[i].m_value;
+				cs[i] = old[idx];
+			}
+			delete [] old;
+		}
+
+		{
+			BT_PROFILE("delete");
+			delete [] idxBuffer;
+			delete [] sortData;
+		}
+#if defined(_DEBUG)
+//		debugPrintf( "nBatches: %d\n", batchIdx );
+		for(int i=0; i<n; i++) ADLASSERT( cs[i].getBatchIdx() != -1 );
+#endif
+		return batchIdx;
+	}
+};
+
+
+
+enum	//	currently empty: every enumerator below is commented out
+{
+//	N_SPLIT = SOLVER_N_SPLIT,
+//	MAX_TASKS_PER_BATCH = N_SPLIT*N_SPLIT/4,
+};
+
+//	One unit of host solver work: walks constraints [m_start, m_start+m_nConstraints) and
+//	applies either the contact pass or the friction pass to the bodies they reference.
+//	The three buffers are stored as const Buffer pointers and viewed as HostBuffer in run().
+struct SolveTask// : public ThreadPool::Task
+{
+	//	A freshly constructed task runs the friction pass; clear m_solveFriction for the contact pass.
+	SolveTask(const Buffer<RigidBodyBase::Body>* bodies, const Buffer<RigidBodyBase::Inertia>* shapes, const Buffer<Constraint4>* constraints,
+		int start, int nConstraints)
+		: m_bodies( bodies ), m_shapes( shapes ), m_constraints( constraints ), m_start( start ), m_nConstraints( nConstraints ),
+		m_solveFriction( true ){}
+
+	u16 getType(){ return 0; }
+
+	//	Processes the constraint range sequentially. tIdx is not used.
+	void run(int tIdx)
+	{
+		HostBuffer<RigidBodyBase::Body>& bodies = *(HostBuffer<RigidBodyBase::Body>*)m_bodies;
+		HostBuffer<RigidBodyBase::Inertia>& shapes = *(HostBuffer<RigidBodyBase::Inertia>*)m_shapes;
+		HostBuffer<Constraint4>& constraints = *(HostBuffer<Constraint4>*)m_constraints;
+
+		const int end = m_start + m_nConstraints;
+		for(int i=m_start; i<end; i++)
+		{
+			float frictionCoeff = constraints[i].getFrictionCoeff();
+			const int aIdx = (int)constraints[i].m_bodyA;
+			const int bIdx = (int)constraints[i].m_bodyB;
+			RigidBodyBase::Body& bodyA = bodies[aIdx];
+			RigidBodyBase::Body& bodyB = bodies[bIdx];
+
+			float maxRambdaDt[4];
+			float minRambdaDt[4];
+
+			if( m_solveFriction )
+			{
+				//	Friction limits are +/- (coefficient * total normal impulse applied so far).
+				float sum = 0;
+				for(int j=0; j<4; j++)
+				{
+					sum += constraints[i].m_appliedRambdaDt[j];
+				}
+
+				//	The coefficient read from the constraint is replaced by a fixed 0.7.
+				frictionCoeff = 0.7f;
+				for(int j=0; j<4; j++)
+				{
+					maxRambdaDt[j] = frictionCoeff*sum;
+					minRambdaDt[j] = -maxRambdaDt[j];
+				}
+
+				SolverInl::solveFriction( constraints[i],
+					bodyA.m_pos, (MYF4&)bodyA.m_linVel, (MYF4&)bodyA.m_angVel, bodyA.m_invMass, shapes[aIdx].m_invInertia,
+					bodyB.m_pos, (MYF4&)bodyB.m_linVel, (MYF4&)bodyB.m_angVel, bodyB.m_invMass, shapes[bIdx].m_invInertia,
+					maxRambdaDt, minRambdaDt );
+			}
+			else
+			{
+				//	Contact impulses may only push: accumulated value kept within [0, FLT_MAX].
+				for(int j=0; j<4; j++)
+				{
+					maxRambdaDt[j] = FLT_MAX;
+					minRambdaDt[j] = 0.f;
+				}
+
+				SolverInl::solveContact<false>( constraints[i],
+					bodyA.m_pos, (MYF4&)bodyA.m_linVel, (MYF4&)bodyA.m_angVel, bodyA.m_invMass, shapes[aIdx].m_invInertia,
+					bodyB.m_pos, (MYF4&)bodyB.m_linVel, (MYF4&)bodyB.m_angVel, bodyB.m_invMass, shapes[bIdx].m_invInertia,
+					maxRambdaDt, minRambdaDt );
+			}
+		}
+	}
+
+	const Buffer<RigidBodyBase::Body>* m_bodies;
+	const Buffer<RigidBodyBase::Inertia>* m_shapes;
+	const Buffer<Constraint4>* m_constraints;
+	int m_start;			//	first constraint index handled by this task
+	int m_nConstraints;		//	number of constraints handled by this task
+	bool m_solveFriction;	//	true: friction pass, false: contact pass
+};
+
+
+//	Creates the host solver state for the given device.
+//	pairCapacity is not used by the host implementation.
+//	m_parallelSolveData starts as null; the returned object is released with deallocate().
+//	Declared inline (not static): a storage class is not allowed on an explicit
+//	specialization, and inline keeps this include-file definition safe across translation units.
+template<>
+inline Solver<adl::TYPE_HOST>::Data* Solver<adl::TYPE_HOST>::allocate( const Device* device, int pairCapacity )
+{
+	Solver<adl::TYPE_HOST>::Data* data = new Data;
+	data->m_device = device;
+	data->m_parallelSolveData = 0;
+
+	return data;
+}
+
+//	Releases a Data object created by Solver<TYPE_HOST>::allocate, together with the
+//	ParallelSolveData attached to it (if any).
+//	Declared inline (not static): a storage class is not allowed on an explicit specialization.
+template<>
+inline void Solver<adl::TYPE_HOST>::deallocate( Solver<TYPE_HOST>::Data* data )
+{
+	//	Cast back to the real type before deleting; deleting a null pointer is a no-op.
+	delete (SolverInl::ParallelSolveData*)data->m_parallelSolveData;
+	delete data;
+}
+
+
+//	Host path: groups contacts by spatial cell when cfg.m_enableParallelSolve is set.
+//	Steps (numbering follows the original comments):
+//	  2. compute a cell key per contact from the x/z position of one of its bodies,
+//	  3. radix sort the (key, contact index) pairs,
+//	  4. count contacts per cell and prefix-scan the counts into offsets; both arrays live in
+//	     a ParallelSolveData stored in data->m_parallelSolveData,
+//	  5. reorder the contacts in place according to the sorted pairs.
+//	When parallel solve is disabled the buffers are only mapped and unmapped.
+//	  data           - solver state; its device type must be TYPE_HOST (asserted).
+//	  bodyBuf        - rigid bodies; only m_pos is read.
+//	  contactsIn     - contacts, reordered in place.
+//	  additionalData - not used by this function.
+//	  nContacts      - number of contacts.
+//	  cfg            - supplies m_enableParallelSolve, m_staticIdx and m_averageExtent.
+void sortContacts2(  Solver<TYPE_HOST>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+			Buffer<Contact4>* contactsIn, void* additionalData, 
+			int nContacts, const Solver<TYPE_HOST>::ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_HOST );
+	HostBuffer<RigidBodyBase::Body>* bodyNative 
+		= (HostBuffer<RigidBodyBase::Body>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, bodyBuf );
+	HostBuffer<Contact4>* contactNative 
+		= (HostBuffer<Contact4>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, contactsIn);
+
+	if( cfg.m_enableParallelSolve )
+	{
+		//	Expected to be unset here. If it is not (assert compiled out), free the previous
+		//	block before replacing it so it is not leaked; deleting null is a no-op.
+		ADLASSERT( data->m_parallelSolveData == 0 );
+		delete (SolverInl::ParallelSolveData*)data->m_parallelSolveData;
+		data->m_parallelSolveData = new SolverInl::ParallelSolveData;
+		SolverInl::ParallelSolveData* solveData = (SolverInl::ParallelSolveData*)data->m_parallelSolveData;
+
+		HostBuffer<SortData> sortData( data->m_device, nContacts );
+		{	//	2. set cell idx
+			float spacing = adl::SolverBase::N_OBJ_PER_SPLIT*cfg.m_averageExtent;
+			float xScale = 1.f/spacing;
+			for(int i=0; i<nContacts; i++)
+			{
+				//	Use body B when body A is the static body, body A otherwise.
+				int idx = ((*contactNative)[i].m_bodyAPtr==cfg.m_staticIdx)? (*contactNative)[i].m_bodyBPtr:(*contactNative)[i].m_bodyAPtr;
+				float4& p = (*bodyNative)[idx].m_pos;
+				//	Negative coordinates are shifted by one before scaling and truncation; the
+				//	mask with N_SPLIT-1 wraps the result into [0, N_SPLIT).
+				int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*xScale)&(adl::SolverBase::N_SPLIT-1);
+				int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*xScale)&(adl::SolverBase::N_SPLIT-1);
+				ADLASSERT( xIdx >= 0 && xIdx < adl::SolverBase::N_SPLIT );
+				ADLASSERT( zIdx >= 0 && zIdx < adl::SolverBase::N_SPLIT );
+				sortData[i].m_key = (xIdx+zIdx*adl::SolverBase::N_SPLIT);
+				sortData[i].m_value = i;
+			}
+		}
+
+		{	//	3. sort by cell idx
+			RadixSort<TYPE_HOST>::Data* sData = RadixSort<TYPE_HOST>::allocate( data->m_device, nContacts );
+
+			RadixSort<TYPE_HOST>::execute( sData, sortData, nContacts );
+
+			RadixSort<TYPE_HOST>::deallocate( sData );
+		}
+
+		{	//	4. find entries
+			//	counts / offsets wrap the arrays inside solveData, so the results land there.
+			HostBuffer<u32> counts; counts.setRawPtr( data->m_device, solveData->m_n, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+			HostBuffer<u32> offsets; offsets.setRawPtr( data->m_device, solveData->m_offset, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+			{
+				BoundSearch<TYPE_HOST>::Data* sData = BoundSearch<TYPE_HOST>::allocate( data->m_device );
+				PrefixScan<TYPE_HOST>::Data* pData = PrefixScan<TYPE_HOST>::allocate( data->m_device, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+
+				BoundSearch<TYPE_HOST>::execute( sData, sortData, nContacts, counts, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT, BoundSearchBase::COUNT );
+
+				PrefixScan<TYPE_HOST>::execute( pData, counts, offsets, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+				
+				BoundSearch<TYPE_HOST>::deallocate( sData );
+				PrefixScan<TYPE_HOST>::deallocate( pData );
+			}
+#if defined(_DEBUG)
+			{
+				//	Debug cross-check: recompute counts and exclusive offsets with plain loops.
+				HostBuffer<u32> n0( data->m_device, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+				HostBuffer<u32> offset0( data->m_device, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+				for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
+				{
+					n0[i] = 0;
+					offset0[i] = 0;
+				}
+
+				for(int i=0; i<nContacts; i++)
+				{
+					int idx = sortData[i].m_key;
+					n0[idx]++;
+				}
+
+				//	scan
+				int sum = 0;
+				for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
+				{
+					offset0[i] = sum;
+					sum += n0[i];
+				}
+
+				for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
+				{
+					ADLASSERT( n0[i] == counts[i] );
+					ADLASSERT( offset0[i] == offsets[i] );
+				}
+			}
+#endif
+		}
+
+		{	//	5. sort constraints by cellIdx
+			//	Gather through a temporary copy so the in-place reorder does not overwrite sources.
+			Contact4* old = new Contact4[nContacts];
+			memcpy( old, contactNative->m_ptr, sizeof(Contact4)*nContacts );
+			for(int i=0; i<nContacts; i++)
+			{
+				int srcIdx = sortData[i].m_value;
+				(*contactNative)[i] = old[srcIdx];
+			}
+			delete [] old;
+		}
+	}
+
+	BufferUtils::unmap<false>( bodyNative, bodyBuf );
+	BufferUtils::unmap<true>( contactNative, contactsIn );
+}
+
+//	Host path: runs the three preparation stages in order -
+//	sortContacts2, Solver<TYPE_HOST>::batchContacts, Solver<TYPE_HOST>::convertToConstraints.
+//	The batching stage reads the per-cell counts and offsets that sortContacts2 stores in
+//	data->m_parallelSolveData, so that pointer must be set when batching starts
+//	(sortContacts2 only creates it when cfg.m_enableParallelSolve is true).
+//	All parameters are forwarded unchanged to the stages.
+static void reorderConvertToConstraints2( Solver<TYPE_HOST>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf,
+	adl::Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const Solver<TYPE_HOST>::ConstraintCfg& cfg )
+{
+	sortContacts2( data, bodyBuf, contactsIn, additionalData, nContacts, cfg );
+
+	{
+		SolverInl::ParallelSolveData* solveData = (SolverInl::ParallelSolveData*)data->m_parallelSolveData;
+		//	Its member arrays are wrapped below; a null pointer here would be dereferenced.
+		ADLASSERT( solveData != 0 );
+		Buffer<u32> n; n.setRawPtr( data->m_device, solveData->m_n, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+		Buffer<u32> offsets; offsets.setRawPtr( data->m_device, solveData->m_offset, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+		Solver<TYPE_HOST>::batchContacts( data, contactsIn, nContacts, &n, &offsets, cfg.m_staticIdx );
+		//	(a leftover debug print to stdout was removed from this spot)
+	}
+	
+	Solver<TYPE_HOST>::convertToConstraints( data, bodyBuf, shapeBuf, contactsIn, contactCOut, additionalData, nContacts, cfg );
+}
+
+//	Host path: solves n constraints sequentially.
+//	Runs data->m_nIterations contact passes over constraints [0, n), followed by the same
+//	number of friction passes. Each pass is one SolveTask executed inline.
+//	  bodyBuf        - rigid bodies; unmapped with <true> at the end.
+//	  shapeBuf       - inertia data.
+//	  constraint     - reinterpreted as const Buffer<Constraint4>*.
+//	  additionalData - not used by this function.
+//	The TYPE template parameter is not referenced in the body.
+template<DeviceType TYPE>
+static void solveContactConstraint(  Solver<TYPE_HOST>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			SolverData constraint, void* additionalData, int n )
+{
+	const Buffer<Constraint4>* constraintBuf = (const Buffer<Constraint4>*)constraint;
+
+	Buffer<RigidBodyBase::Body>* hostBodies
+		= BufferUtils::map<TYPE_HOST, true>( data->m_device, bodyBuf );
+	Buffer<RigidBodyBase::Inertia>* hostShapes
+		= BufferUtils::map<TYPE_HOST, true>( data->m_device, shapeBuf );
+	Buffer<Constraint4>* hostConstraints
+		= BufferUtils::map<TYPE_HOST, true>( data->m_device, constraintBuf );
+
+	//	pass 0: contact impulses, pass 1: friction impulses
+	for(int pass=0; pass<2; pass++)
+	{
+		const bool frictionPass = (pass == 1);
+		for(int iter=0; iter<data->m_nIterations; iter++)
+		{
+			SolveTask task( hostBodies, hostShapes, hostConstraints, 0, n );
+			task.m_solveFriction = frictionPass;
+			task.run(0);
+		}
+	}
+
+	BufferUtils::unmap<true>( hostBodies, bodyBuf );
+	BufferUtils::unmap<false>( hostShapes, shapeBuf );
+	BufferUtils::unmap<false>( hostConstraints, constraintBuf );
+}
+
+#if 0	//	disabled: createSolveTasks is compiled out; its former body is kept inside the block comment below
+static
+int createSolveTasks( int batchIdx, Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			SolverData constraint, int n, ThreadPool::Task* tasksOut[], int taskCapacity )
+{
+/*
+	ADLASSERT( (N_SPLIT&1) == 0 );
+	ADLASSERT( batchIdx < N_BATCHES );
+	ADLASSERT( data->m_device->m_type == TYPE_HOST );
+	ADLASSERT( data->m_parallelSolveData );
+
+	SolverInl::ParallelSolveData* solveData = (SolverInl::ParallelSolveData*)data->m_parallelSolveData;
+	data->m_batchIdx = 0;
+
+	const int nx = N_SPLIT/2;
+
+	int nTasksCreated = 0;
+
+//	for(int ii=0; ii<2; ii++)
+	for(batchIdx=0; batchIdx<4; batchIdx++)
+	{
+		int2 offset = make_int2( batchIdx&1, batchIdx>>1 );
+		for(int ix=0; ix<nx; ix++) for(int iy=0; iy<nx; iy++)
+		{
+			int xIdx = ix*2 + offset.x;
+			int yIdx = iy*2 + offset.y;
+			int cellIdx = xIdx+yIdx*N_SPLIT;
+
+			int n = solveData->m_n[cellIdx];
+			int start = solveData->m_offset[cellIdx];
+
+			if( n == 0 ) continue;
+
+			SolveTask* task = new SolveTask( bodyBuf, shapeBuf, (const Buffer<Constraint4>*)constraint, start, n );
+//			task->m_solveFriction = (ii==0)? false:true;
+			tasksOut[nTasksCreated++] = task;
+		}
+	}
+
+	return nTasksCreated;
+*/
+	ADLASSERT(0);
+	return 0;
+}
+#endif	//	0 (createSolveTasks)
+
+
+
+//	Host path: builds one Constraint4 per valid contact with SolverInl::setConstraint4.
+//	A body whose linear and angular velocity are both zero is passed with zero inverse mass
+//	and a zero inverse inertia matrix. Contacts for which isInvalid() is true are skipped
+//	and their output slot is not written. The batch index of each contact is copied to the
+//	constraint. In non-_DEBUG builds the loop carries an OpenMP parallel-for pragma.
+//	  data           - solver state; its device type must be TYPE_HOST (asserted).
+//	  contactCOut    - output, reinterpreted as Buffer<Constraint4>*.
+//	  additionalData - not used by this function.
+static void convertToConstraints2(  Solver<TYPE_HOST>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+	Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const Solver<TYPE_HOST>::ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_HOST );
+
+	Buffer<Constraint4>* constraintBuf = (Buffer<Constraint4>*)contactCOut;
+
+	HostBuffer<RigidBodyBase::Body>* bodies 
+		= (HostBuffer<RigidBodyBase::Body>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, bodyBuf );
+	HostBuffer<RigidBodyBase::Inertia>* shapes 
+		= (HostBuffer<RigidBodyBase::Inertia>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, shapeBuf );
+	HostBuffer<Contact4>* contacts 
+		= (HostBuffer<Contact4>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, contactsIn );
+	HostBuffer<Constraint4>* constraints 
+		= (HostBuffer<Constraint4>*)BufferUtils::map<TYPE_HOST, false>( data->m_device, constraintBuf );
+
+#if !defined(_DEBUG)
+#pragma omp parallel for
+#endif
+	for(int i=0; i<nContacts; i++)
+	{
+		Contact4& contact = (*contacts)[i];
+		if( contact.isInvalid() ) continue;
+
+		const int aIdx = (int)contact.m_bodyAPtr;
+		const int bIdx = (int)contact.m_bodyBPtr;
+		const RigidBodyBase::Body& bodyA = (*bodies)[aIdx];
+		const RigidBodyBase::Body& bodyB = (*bodies)[bIdx];
+
+		//	Local copies of the body state handed to setConstraint4.
+		MYF4 posA( bodyA.m_pos );
+		MYF4 linVelA( bodyA.m_linVel );
+		MYF4 angVelA( bodyA.m_angVel );
+		MYF4 posB( bodyB.m_pos );
+		MYF4 linVelB( bodyB.m_linVel );
+		MYF4 angVelB( bodyB.m_angVel );
+
+		//	A body that is not moving at all is given zero inverse mass / inertia.
+		const bool aAtRest = isZero( linVelA ) && isZero( angVelA );
+		const bool bAtRest = isZero( linVelB ) && isZero( angVelB );
+
+		const float invMassA = aAtRest ? 0.f : bodyA.m_invMass;
+		const float invMassB = bAtRest ? 0.f : bodyB.m_invMass;
+		const Matrix3x3 invInertiaA = aAtRest ? mtZero() : (*shapes)[aIdx].m_invInertia;
+		const Matrix3x3 invInertiaB = bAtRest ? mtZero() : (*shapes)[bIdx].m_invInertia;
+
+		Constraint4& dst = (*constraints)[i];
+		SolverInl::setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA,
+			posB, linVelB, angVelB, invMassB, invInertiaB,
+			contact, cfg, dst );
+		dst.m_batchIdx = contact.getBatchIdx();
+	}
+
+	//	Only the constraint buffer is unmapped with <true>; the three inputs use <false>.
+	BufferUtils::unmap<false>( bodies, bodyBuf );
+	BufferUtils::unmap<false>( shapes, shapeBuf );
+	BufferUtils::unmap<false>( contacts, contactsIn );
+	BufferUtils::unmap<true>( constraints, constraintBuf );
+}
+
+
+
+
+
+//	Sorts the contacts of every grid cell into batches on the host.
+//
+//	For each of the N_SPLIT*N_SPLIT cells, n holds the number of contacts in the cell and
+//	offsets the index of the cell's first contact inside contacts. Every non-empty cell's
+//	range is handed to SolverInl::sortConstraintByBatch, which reorders it in place.
+//
+//	data       solver data; its device must be of TYPE_HOST (asserted)
+//	contacts   nContacts contacts, modified in place
+//	n          per-cell contact counts (read only here)
+//	offsets    per-cell start offsets (read only here)
+//	staticIdx  forwarded unchanged to sortConstraintByBatch
+static void batchContacts2(  Solver<TYPE_HOST>::Data* data, Buffer<Contact4>* contacts, int nContacts, Buffer<u32>* n, Buffer<u32>* offsets, int staticIdx )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_HOST );
+
+	HostBuffer<Contact4>* contactNative =0;
+	HostBuffer<u32>* nNative =0;
+	HostBuffer<u32>* offsetsNative =0;
+
+	{
+		BT_PROFILE("BufferUtils::map");
+		contactNative  = (HostBuffer<Contact4>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, contacts, nContacts );
+	}
+	{
+		BT_PROFILE("BufferUtils::map2");
+		nNative = (HostBuffer<u32>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, n );
+		offsetsNative= (HostBuffer<u32>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, offsets );
+	}
+
+	
+	{
+		BT_PROFILE("sortConstraintByBatch");
+		int maxNumBatches = 0;
+
+		const int numCells = adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT;
+		for(int i=0; i<numCells; i++)
+		{
+			//	named cellCount/cellOffset so the locals no longer shadow the parameter n
+			const int cellCount = (*nNative)[i];
+			const int cellOffset = (*offsetsNative)[i];
+
+			if( cellCount ) 
+			{
+				int numBatches = SolverInl::sortConstraintByBatch( contactNative->m_ptr+cellOffset, cellCount, staticIdx,-1 );	//	on GPU
+				maxNumBatches = max(numBatches,maxNumBatches);
+
+	//			SolverInl::sortConstraintByBatch( contactNative->m_ptr+cellOffset, cellCount, staticIdx );	//	on CPU
+			}
+		}
+
+		//	NOTE(review): unconditional console output on every call; looks like leftover
+		//	diagnostics, kept because callers may rely on seeing it.
+		printf("maxNumBatches = %d\n", maxNumBatches);
+	}
+
+	{
+		BT_PROFILE("BufferUtils::unmap");
+		//	contacts were reordered, so they are released with unmap<true>
+		BufferUtils::unmap<true>( contactNative, contacts, nContacts );
+	}
+	{
+		BT_PROFILE("BufferUtils::unmap2");
+		BufferUtils::unmap<false>( nNative, n );
+		BufferUtils::unmap<false>( offsetsNative, offsets );
+	}
+
+
+}
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverKernels.cl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverKernels.cl
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverKernels.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverKernels.h
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/batchingKernels.cl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/batchingKernels.cl
@@ -0,0 +1,338 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#else
+#define counter32_t volatile __global int*
+#endif
+
+
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
+#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+#define AppendInc(x, out) out = atomic_inc(x)
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+
+
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+
+#define make_float4 (float4)
+#define make_float2 (float2)
+#define make_uint4 (uint4)
+#define make_int4 (int4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+
+
+#define max2 max
+#define min2 min
+
+
+#define WG_SIZE 64
+
+
+
+//	Contact record as seen by the batching kernel. CreateBatches copies whole records from
+//	its input to its output buffer, reads m_bodyA / m_bodyB and overwrites m_batchIdx.
+//	NOTE(review): the field layout presumably has to match the host-side Contact4 - confirm.
+typedef struct 
+{
+	float4 m_worldPos[4];
+	float4 m_worldNormal;
+	u32 m_coeffs;
+	int m_batchIdx;		//	batch label written by CreateBatches
+
+	u32 m_bodyA;		//	index of the first body of the pair
+	u32 m_bodyB;		//	index of the second body of the pair
+}Contact4;
+
+//	By-value kernel argument of CreateBatches. The kernel only copies m_staticIdx into a
+//	local (whose remaining uses are commented out); m_n and m_start are not read there,
+//	the per-group count and start come from the gN / gStart buffers instead.
+typedef struct 
+{
+	int m_n;
+	int m_start;
+	int m_staticIdx;
+	int m_paddings[1];
+} ConstBuffer;
+
+//	Compact description of one pending constraint held in the local ring buffer of
+//	CreateBatches.
+typedef struct 
+{
+	u32 m_a;		//	smaller of the constraint's two body indices
+	u32 m_b;		//	larger of the constraint's two body indices
+	u32 m_idx;		//	constraint index relative to the work-group's start offset
+}Elem;
+
+//	Sizes of the local-memory structures used by CreateBatches.
+//	STACK_SIZE: entries in ldsStackIdx, the list of constraints accepted for the current batch.
+#define STACK_SIZE (WG_SIZE*10)
+//#define STACK_SIZE (WG_SIZE)
+//	RING_SIZE: entries in ldsRingElem, the buffer of constraints still waiting for a batch.
+#define RING_SIZE 1024
+#define RING_SIZE_MASK (RING_SIZE-1)
+//	CHECK_SIZE: number of u32 words in each body bitset, i.e. 32*CHECK_SIZE flags
+//	(see readBuf / writeBuf / tryWrite).
+#define CHECK_SIZE (WG_SIZE)
+
+
+//	Free slots left in the ring; expands to an expression that needs ldsRingEnd in scope.
+#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)
+//	Alias: the scratch counter ldsTmp is used as the write cursor when the ring is compacted.
+#define RING_END ldsTmp
+
+//	Tests the flag belonging to idx in a bitset of 32*CHECK_SIZE bits stored in buff.
+//	idx is wrapped modulo the bitset size, so different indices can share one bit.
+//	Returns the masked word: non-zero when the flag is set, 0 when it is clear.
+u32 readBuf(__local u32* buff, int idx)
+{
+	const int slot = idx % (32*CHECK_SIZE);
+	const int word = slot/32;
+	const int bit = slot%32;
+	return buff[word] & (1<<bit);
+}
+
+//	Sets the flag belonging to idx in a bitset of 32*CHECK_SIZE bits stored in buff.
+//	idx is wrapped modulo the bitset size. The bit is set with an atomic OR rather than a
+//	plain read-modify-write, so work-items updating the same word do not lose each other's bits.
+void writeBuf(__local u32* buff, int idx)
+{
+	const int slot = idx % (32*CHECK_SIZE);
+	const int word = slot/32;
+	const int bit = slot%32;
+	atom_or( &buff[word], (1<<bit) );
+}
+
+//	Atomically sets the flag belonging to idx (wrapped modulo 32*CHECK_SIZE) in buff and
+//	reports whether this call was the one that set it.
+//	Returns 1 if the flag was clear beforehand, 0 if it was already set.
+u32 tryWrite(__local u32* buff, int idx)
+{
+	const int slot = idx % (32*CHECK_SIZE);
+	const int word = slot/32;
+	const int bit = slot%32;
+
+	//	atom_or hands back the word as it was before the OR
+	const u32 before = (u32)atom_or( &buff[word], (1<<bit) );
+	const u32 wasSet = (before >> bit) & 1;
+	return wasSet == 0;
+}
+
+//	batching on the GPU
+//
+//	Work-group g handles the gN[g] constraints that start at gStart[g] in gConstraints and
+//	writes them, reordered and labelled with m_batchIdx, to the same range of gConstraintsOut.
+//
+//	Each iteration of the outer loop builds one batch (label ie):
+//	  1. pending constraints are pulled from global memory into a local ring buffer,
+//	  2. the ring is scanned WG_SIZE elements at a time; an element is accepted when neither
+//	     of its bodies is flagged in ldsFixedBuffer and it wins both bits in ldsCheckBuffer,
+//	  3. accepted elements go onto a local stack, the rest are compacted to the ring's front,
+//	  4. the stack is copied to gConstraintsOut.
+//	If an iteration accepts nothing, everything left in the ring is written out with the
+//	labels 100+ring position.
+//
+//	Body flags are looked up modulo 32*CHECK_SIZE, so unrelated bodies can block each other.
+//
+//	NOTE(review): the ldsFixedBuffer flags are cleared when a full stack is flushed while the
+//	label stays ie, so later constraints with the same label may share a body with flushed
+//	ones - confirm this is intended.
+//	NOTE(review): the fallback labels 100+i can coincide with regular labels ie >= 100.
+//	NOTE(review): the outer loop gives up after 250 iterations; constraints still pending at
+//	that point are not written to gConstraintsOut.
+//	NOTE(review): several shared counters are read right after other work-items update them
+//	without a barrier in between (e.g. ldsStackEnd, ldsRingEnd, ldsCheckBuffer); this looks
+//	like it assumes the whole work-group runs in lockstep - confirm before changing WG_SIZE
+//	or the target hardware.
+__kernel void CreateBatches( __global Contact4* gConstraints, __global Contact4* gConstraintsOut,
+		__global u32* gN, __global u32* gStart, 
+		ConstBuffer cb )
+{
+	__local u32 ldsStackIdx[STACK_SIZE];	//	constraints accepted for the current batch
+	__local u32 ldsStackEnd;		//	number of valid entries in ldsStackIdx
+	__local Elem ldsRingElem[RING_SIZE];	//	constraints still waiting for a batch
+	__local u32 ldsRingEnd;			//	number of valid entries in ldsRingElem
+	__local u32 ldsTmp;			//	RING_END: write cursor while compacting the ring
+	__local u32 ldsCheckBuffer[CHECK_SIZE];	//	body bits claimed within the current chunk
+	__local u32 ldsFixedBuffer[CHECK_SIZE];	//	body bits already used by the current batch
+	__local u32 ldsGEnd;			//	source constraints fetched so far
+	__local u32 ldsDstEnd;			//	next free slot in gConstraintsOut
+
+	int wgIdx = GET_GROUP_IDX;
+	int lIdx = GET_LOCAL_IDX;
+	
+	const int m_n = gN[wgIdx];
+	const int m_start = gStart[wgIdx];
+	const int m_staticIdx = cb.m_staticIdx;
+		
+	if( lIdx == 0 )
+	{
+		ldsRingEnd = 0;
+		ldsGEnd = 0;
+		ldsStackEnd = 0;
+		ldsDstEnd = m_start;
+	}
+
+	//	make work-item 0's initialisation visible before any other work-item reads the counters
+	GROUP_LDS_BARRIER;
+	
+//	while(1)
+	for(int ie=0; ie<250; ie++)
+	{
+		//	new batch: no body is in use yet
+		for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsFixedBuffer[i] = 0;
+
+		for(int giter=0; giter<4; giter++)
+		{
+			int ringCap = GET_RING_CAPACITY;
+		
+			//	1. fill ring
+			if( ldsGEnd < m_n )
+			{
+				while( ringCap > WG_SIZE )
+				{
+					if( ldsGEnd >= m_n ) break;
+					//	at most ringCap - WG_SIZE work-items fetch per pass
+					if( lIdx < ringCap - WG_SIZE )
+					{
+						int srcIdx;
+						AtomInc1( ldsGEnd, srcIdx );
+						if( srcIdx < m_n )
+						{
+							int dstIdx;
+							AtomInc1( ldsRingEnd, dstIdx );
+							
+							//	store the body pair ordered (min, max)
+							int a = gConstraints[m_start+srcIdx].m_bodyA;
+							int b = gConstraints[m_start+srcIdx].m_bodyB;
+							ldsRingElem[dstIdx].m_a = (a>b)? b:a;
+							ldsRingElem[dstIdx].m_b = (a>b)? a:b;
+							ldsRingElem[dstIdx].m_idx = srcIdx;
+						}
+					}
+					ringCap = GET_RING_CAPACITY;
+				}
+			}
+
+			GROUP_LDS_BARRIER;
+	
+			//	2. fill stack
+			//	rejected elements are written back to the front of the same ring array
+			__local Elem* dst = ldsRingElem;
+			if( lIdx == 0 ) RING_END = 0;
+
+			int srcIdx=lIdx;
+			int end = ldsRingEnd;
+
+			{
+				for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)
+				{
+					Elem e;
+					if(srcIdx<end) e = ldsRingElem[srcIdx];
+					//	work-items past the end of the ring have nothing to do in this chunk
+					bool done = (srcIdx<end)?false:true;
+
+					//	fresh claim bits for this chunk
+					for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[i] = 0;
+					
+					if( !done )
+					{
+						int aUsed = readBuf( ldsFixedBuffer, e.m_a);
+						int bUsed = readBuf( ldsFixedBuffer, e.m_b);
+
+						if( aUsed==0 && bUsed==0 )
+						{
+							int aAvailable;
+							int bAvailable;
+
+							aAvailable = tryWrite( ldsCheckBuffer, e.m_a );
+							bAvailable = tryWrite( ldsCheckBuffer, e.m_b );
+
+							//aAvailable = (m_staticIdx == e.m_a)? 1: aAvailable;
+							//bAvailable = (m_staticIdx == e.m_b)? 1: bAvailable;
+
+							//	accepted only if this work-item was first to claim both bodies
+							bool success = (aAvailable && bAvailable);
+							if(success)
+							{
+								writeBuf( ldsFixedBuffer, e.m_a );
+								writeBuf( ldsFixedBuffer, e.m_b );
+							}
+							done = success;
+						}
+					}
+
+					//	put it aside
+					if(srcIdx<end)
+					{
+						if( done )
+						{
+							int dstIdx; AtomInc1( ldsStackEnd, dstIdx );
+							if( dstIdx < STACK_SIZE )
+								ldsStackIdx[dstIdx] = e.m_idx;
+							else{
+								//	stack full: undo the reservation and keep the element in the ring
+								done = false;
+								AtomAdd( ldsStackEnd, -1 );
+							}
+						}
+						if( !done )
+						{
+							int dstIdx; AtomInc1( RING_END, dstIdx );
+							dst[dstIdx] = e;
+						}
+					}
+
+					//	if filled, flush
+					if( ldsStackEnd == STACK_SIZE )
+					{
+						for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)
+						{
+							int idx = m_start + ldsStackIdx[i];
+							int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
+							gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
+							gConstraintsOut[ dstIdx ].m_batchIdx = ie;
+						}
+						if( lIdx == 0 ) ldsStackEnd = 0;
+
+						for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsFixedBuffer[i] = 0;
+					}
+				}
+			}
+
+			//	the compacted leftovers become the new ring content
+			if( lIdx == 0 ) ldsRingEnd = RING_END;
+		}
+
+		GROUP_LDS_BARRIER;
+
+		//	write out whatever the stack still holds for this batch
+		for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)
+		{
+			int idx = m_start + ldsStackIdx[i];
+			int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
+			gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
+			gConstraintsOut[ dstIdx ].m_batchIdx = ie;
+		}
+
+		//	in case it couldn't consume any pair. Flush them
+		//	todo. Serial batch worth while?
+		if( ldsStackEnd == 0 )
+		{
+			for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)
+			{
+				int idx = m_start + ldsRingElem[i].m_idx;
+				int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
+				gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
+				gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;
+			}
+			GROUP_LDS_BARRIER;
+			if( lIdx == 0 ) ldsRingEnd = 0;
+		}
+
+		if( lIdx == 0 ) ldsStackEnd = 0;
+
+		GROUP_LDS_BARRIER;
+
+		//	termination
+		if( ldsGEnd == m_n && ldsRingEnd == 0 )
+			break;
+	}
+
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/batchingKernels.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/batchingKernels.h
@@ -0,0 +1,371 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+static const char* batchingKernelsCL= \
+"\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#else\n"
+"#define counter32_t volatile __global int*\n"
+"#endif\n"
+"\n"
+"\n"
+"typedef unsigned int u32;\n"
+"typedef unsigned short u16;\n"
+"typedef unsigned char u8;\n"
+"\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"\n"
+"\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"\n"
+"#define make_float4 (float4)\n"
+"#define make_float2 (float2)\n"
+"#define make_uint4 (uint4)\n"
+"#define make_int4 (int4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"\n"
+"\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"\n"
+"\n"
+"#define WG_SIZE 64\n"
+"\n"
+"\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	float4 m_worldPos[4];\n"
+"	float4 m_worldNormal;\n"
+"	u32 m_coeffs;\n"
+"	int m_batchIdx;\n"
+"\n"
+"	u32 m_bodyA;\n"
+"	u32 m_bodyB;\n"
+"}Contact4;\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	int m_n;\n"
+"	int m_start;\n"
+"	int m_staticIdx;\n"
+"	int m_paddings[1];\n"
+"} ConstBuffer;\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	u32 m_a;\n"
+"	u32 m_b;\n"
+"	u32 m_idx;\n"
+"}Elem;\n"
+"\n"
+"#define STACK_SIZE (WG_SIZE*10)\n"
+"//#define STACK_SIZE (WG_SIZE)\n"
+"#define RING_SIZE 1024\n"
+"#define RING_SIZE_MASK (RING_SIZE-1)\n"
+"#define CHECK_SIZE (WG_SIZE)\n"
+"\n"
+"\n"
+"#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n"
+"#define RING_END ldsTmp\n"
+"\n"
+"u32 readBuf(__local u32* buff, int idx)\n"
+"{\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"	return buff[bufIdx] & (1<<bitIdx);\n"
+"}\n"
+"\n"
+"void writeBuf(__local u32* buff, int idx)\n"
+"{\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"//	buff[bufIdx] |= (1<<bitIdx);\n"
+"	atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
+"}\n"
+"\n"
+"u32 tryWrite(__local u32* buff, int idx)\n"
+"{\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
+"	return ((ans >> bitIdx)&1) == 0;\n"
+"}\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	int m_valInt0;\n"
+"	int m_valInt1;\n"
+"	int m_valInt2;\n"
+"	int m_valInt3;\n"
+"\n"
+"	int m_valInt4;\n"
+"	int m_valInt5;\n"
+"	int m_valInt6;\n"
+"	int m_valInt7;\n"
+"\n"
+"	int m_valInt8;\n"
+"	int m_valInt9;\n"
+"	int m_valInt10;\n"
+"	int m_valInt11;\n"
+"	\n"
+"	int	m_valInt12;\n"
+"	int	m_valInt13;\n"
+"	int	m_valInt14;\n"
+"	int	m_valInt15;\n"
+"\n"
+"\n"
+"	float m_fval0;\n"
+"	float m_fval1;\n"
+"	float m_fval2;\n"
+"	float m_fval3;\n"
+"} SolverDebugInfo;\n"
+"\n"
+"//	batching on the GPU\n"
+"__kernel void CreateBatches( __global Contact4* gConstraints, __global Contact4* gConstraintsOut, //__global u32* gRes, \n"
+"		__global u32* gN, __global u32* gStart, \n"
+"//		__global SolverDebugInfo* debugInfo, \n"
+"		ConstBuffer cb )\n"
+"{\n"
+"	__local u32 ldsStackIdx[STACK_SIZE];\n"
+"	__local u32 ldsStackEnd;\n"
+"	__local Elem ldsRingElem[RING_SIZE];\n"
+"	__local u32 ldsRingEnd;\n"
+"	__local u32 ldsTmp;\n"
+"	__local u32 ldsCheckBuffer[CHECK_SIZE];\n"
+"	__local u32 ldsFixedBuffer[CHECK_SIZE];\n"
+"	__local u32 ldsGEnd;\n"
+"	__local u32 ldsDstEnd;\n"
+"\n"
+"	int wgIdx = GET_GROUP_IDX;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	\n"
+"	const int m_n = gN[wgIdx];\n"
+"	const int m_start = gStart[wgIdx];\n"
+"	const int m_staticIdx = cb.m_staticIdx;\n"
+"		\n"
+"	if( lIdx == 0 )\n"
+"	{\n"
+"		ldsRingEnd = 0;\n"
+"		ldsGEnd = 0;\n"
+"		ldsStackEnd = 0;\n"
+"		ldsDstEnd = m_start;\n"
+"	}\n"
+"	\n"
+"//	while(1)\n"
+"	for(int ie=0; ie<250; ie++)\n"
+"	{\n"
+"		ldsFixedBuffer[lIdx] = 0;\n"
+"\n"
+"		for(int giter=0; giter<4; giter++)\n"
+"		{\n"
+"			int ringCap = GET_RING_CAPACITY;\n"
+"		\n"
+"			//	1. fill ring\n"
+"			if( ldsGEnd < m_n )\n"
+"			{\n"
+"				while( ringCap > WG_SIZE )\n"
+"				{\n"
+"					if( ldsGEnd >= m_n ) break;\n"
+"					if( lIdx < ringCap - WG_SIZE )\n"
+"					{\n"
+"						int srcIdx;\n"
+"						AtomInc1( ldsGEnd, srcIdx );\n"
+"						if( srcIdx < m_n )\n"
+"						{\n"
+"							int dstIdx;\n"
+"							AtomInc1( ldsRingEnd, dstIdx );\n"
+"							\n"
+"							int a = gConstraints[m_start+srcIdx].m_bodyA;\n"
+"							int b = gConstraints[m_start+srcIdx].m_bodyB;\n"
+"							ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n"
+"							ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n"
+"							ldsRingElem[dstIdx].m_idx = srcIdx;\n"
+"						}\n"
+"					}\n"
+"					ringCap = GET_RING_CAPACITY;\n"
+"				}\n"
+"			}\n"
+"\n"
+"			GROUP_LDS_BARRIER;\n"
+"	\n"
+"			//	2. fill stack\n"
+"			__local Elem* dst = ldsRingElem;\n"
+"			if( lIdx == 0 ) RING_END = 0;\n"
+"\n"
+"			int srcIdx=lIdx;\n"
+"			int end = ldsRingEnd;\n"
+"\n"
+"			{\n"
+"				for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n"
+"				{\n"
+"					Elem e;\n"
+"					if(srcIdx<end) e = ldsRingElem[srcIdx];\n"
+"					bool done = (srcIdx<end)?false:true;\n"
+"\n"
+"					for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n"
+"					\n"
+"					if( !done )\n"
+"					{\n"
+"						int aUsed = readBuf( ldsFixedBuffer, e.m_a);\n"
+"						int bUsed = readBuf( ldsFixedBuffer, e.m_b);\n"
+"\n"
+"						if( aUsed==0 && bUsed==0 )\n"
+"						{\n"
+"							int aAvailable;\n"
+"							int bAvailable;\n"
+"\n"
+"							aAvailable = tryWrite( ldsCheckBuffer, e.m_a );\n"
+"							bAvailable = tryWrite( ldsCheckBuffer, e.m_b );\n"
+"\n"
+"							//aAvailable = (m_staticIdx == e.m_a)? 1: aAvailable;\n"
+"							//bAvailable = (m_staticIdx == e.m_b)? 1: bAvailable;\n"
+"\n"
+"							bool success = (aAvailable && bAvailable);\n"
+"							if(success)\n"
+"							{\n"
+"								writeBuf( ldsFixedBuffer, e.m_a );\n"
+"								writeBuf( ldsFixedBuffer, e.m_b );\n"
+"							}\n"
+"							done = success;\n"
+"						}\n"
+"					}\n"
+"\n"
+"					//	put it aside\n"
+"					if(srcIdx<end)\n"
+"					{\n"
+"						if( done )\n"
+"						{\n"
+"							int dstIdx; AtomInc1( ldsStackEnd, dstIdx );\n"
+"							if( dstIdx < STACK_SIZE )\n"
+"								ldsStackIdx[dstIdx] = e.m_idx;\n"
+"							else{\n"
+"								done = false;\n"
+"								AtomAdd( ldsStackEnd, -1 );\n"
+"							}\n"
+"						}\n"
+"						if( !done )\n"
+"						{\n"
+"							int dstIdx; AtomInc1( RING_END, dstIdx );\n"
+"							dst[dstIdx] = e;\n"
+"						}\n"
+"					}\n"
+"\n"
+"					//	if filled, flush\n"
+"					if( ldsStackEnd == STACK_SIZE )\n"
+"					{\n"
+"						for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)\n"
+"						{\n"
+"							int idx = m_start + ldsStackIdx[i];\n"
+"							int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
+"							gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
+"							gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
+"						}\n"
+"						if( lIdx == 0 ) ldsStackEnd = 0;\n"
+"\n"
+"						//for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n"
+"						ldsFixedBuffer[lIdx] = 0;\n"
+"					}\n"
+"				}\n"
+"			}\n"
+"\n"
+"			if( lIdx == 0 ) ldsRingEnd = RING_END;\n"
+"		}\n"
+"\n"
+"		GROUP_LDS_BARRIER;\n"
+"\n"
+"		for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n"
+"		{\n"
+"			int idx = m_start + ldsStackIdx[i];\n"
+"			int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
+"			gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
+"			gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
+"		}\n"
+"\n"
+"		//	in case it couldn't consume any pair. Flush them\n"
+"		//	todo. Serial batch worth while?\n"
+"		if( ldsStackEnd == 0 )\n"
+"		{\n"
+"			for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)\n"
+"			{\n"
+"				int idx = m_start + ldsRingElem[i].m_idx;\n"
+"				int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
+"				gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
+"				gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;\n"
+"			}\n"
+"			GROUP_LDS_BARRIER;\n"
+"			if( lIdx == 0 ) ldsRingEnd = 0;\n"
+"		}\n"
+"\n"
+"		if( lIdx == 0 ) ldsStackEnd = 0;\n"
+"\n"
+"		GROUP_LDS_BARRIER;\n"
+"\n"
+"		//	termination\n"
+"		if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
+"			break;\n"
+"	}\n"
+"\n"
+"\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+;
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringify.py
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringify.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+import sys
+import os
+import shutil
+
+arg = sys.argv[1]
+fh = open(arg)
+	
+print 'static const char* '+sys.argv[2]+'= \\'
+for line in fh.readlines():
+	a = line.strip('\n')
+	print '"'+a+'\\n"'
+print ';'
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernels.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernels.bat
@@ -0,0 +1,6 @@
+@rem Regenerates ChNarrowphaseKernels.h from ChNarrowphaseKernels.cl; the generated C string is named narrowphaseKernelsCL.
+@rem NOTE(review): calling stringify.py by name presumably relies on a .py file association - confirm.
+stringify.py ChNarrowphaseKernels.cl narrowphaseKernelsCL >ChNarrowphaseKernels.h
+
+
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsAll.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsAll.bat
@@ -0,0 +1,10 @@
+@rem Regenerates all three kernel headers from their .cl sources:
+@rem   ChNarrowphaseKernels.cl -> ChNarrowphaseKernels.h (narrowphaseKernelsCL)
+@rem   SolverKernels.cl        -> SolverKernels.h        (solverKernelsCL)
+@rem   batchingKernels.cl      -> batchingKernels.h      (batchingKernelsCL)
+stringify.py ChNarrowphaseKernels.cl narrowphaseKernelsCL >ChNarrowphaseKernels.h
+stringify.py SolverKernels.cl solverKernelsCL >SolverKernels.h
+stringify.py batchingKernels.cl batchingKernelsCL >batchingKernels.h
+
+
+
+
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsBatching.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsBatching.bat
@@ -0,0 +1,8 @@
+@rem Regenerates batchingKernels.h from batchingKernels.cl; the generated C string is named batchingKernelsCL.
+stringify.py batchingKernels.cl batchingKernelsCL >batchingKernels.h
+
+
+
+
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsNarrowphase.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsNarrowphase.bat
@@ -0,0 +1,8 @@
+@rem Regenerates ChNarrowphaseKernels.h from ChNarrowphaseKernels.cl; the generated C string is named narrowphaseKernelsCL.
+stringify.py ChNarrowphaseKernels.cl narrowphaseKernelsCL >ChNarrowphaseKernels.h
+
+
+
+
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsSolver.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsSolver.bat
@@ -0,0 +1,8 @@
+@rem Regenerates SolverKernels.h from SolverKernels.cl; the generated C string is named solverKernelsCL.
+stringify.py SolverKernels.cl solverKernelsCL >SolverKernels.h
+
+
+
+
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause