diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/ComputeBounds.hlsl b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/ComputeBounds.hlsl
new file mode 100644
index 000000000..e21f959cc
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/ComputeBounds.hlsl
@@ -0,0 +1,83 @@
+MSTRINGIFY(
+
+cbuffer ComputeBoundsCB : register( b0 )
+{
+	int numNodes;
+	int numSoftBodies;
+	int padding1;
+	int padding2;
+};
+
+// Node indices for each link
+StructuredBuffer<int> g_vertexClothIdentifier : register( t0 );
+StructuredBuffer<float4> g_vertexPositions : register( t1 );
+
+RWStructuredBuffer<uint4> g_clothMinBounds : register( u0 );
+RWStructuredBuffer<uint4> g_clothMaxBounds : register( u1 );
+
+groupshared uint4 clothMinBounds[256];
+groupshared uint4 clothMaxBounds[256];
+
+[numthreads(128, 1, 1)]
+void 
+ComputeBoundsKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	const unsigned int UINT_MAX = 0xffffffff;
+
+	// Init min and max bounds arrays
+	if( GTid.x < numSoftBodies )
+	{
+		clothMinBounds[GTid.x] = uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
+		clothMaxBounds[GTid.x] = uint4(0,0,0,0);
+	}
+
+	AllMemoryBarrierWithGroupSync();
+
+	int nodeID = DTid.x;
+	if( nodeID < numNodes )
+	{	
+		int clothIdentifier = g_vertexClothIdentifier[nodeID];
+		if( clothIdentifier >= 0 )
+		{
+			float3 position = g_vertexPositions[nodeID].xyz;
+
+			// Reinterpret position as uint
+			uint3 positionUInt = uint3(asuint(position.x), asuint(position.y), asuint(position.z));
+		
+			// Invert sign bit of positives and whole of negatives to allow comparison as unsigned ints
+			//positionUInt.x ^= uint((-int(positionUInt.x >> 31) | 0x80000000));
+			//positionUInt.y ^= uint((-int(positionUInt.y >> 31) | 0x80000000));
+			//positionUInt.z ^= uint((-int(positionUInt.z >> 31) | 0x80000000));
+			positionUInt.x ^= (1+~(positionUInt.x >> 31) | 0x80000000);
+			positionUInt.y ^= (1+~(positionUInt.y >> 31) | 0x80000000);		
+			positionUInt.z ^= (1+~(positionUInt.z >> 31) | 0x80000000);
+		
+			// Min/max with the LDS values
+			InterlockedMin(clothMinBounds[clothIdentifier].x, positionUInt.x);
+			InterlockedMin(clothMinBounds[clothIdentifier].y, positionUInt.y);
+			InterlockedMin(clothMinBounds[clothIdentifier].z, positionUInt.z);
+
+			InterlockedMax(clothMaxBounds[clothIdentifier].x, positionUInt.x);
+			InterlockedMax(clothMaxBounds[clothIdentifier].y, positionUInt.y);
+			InterlockedMax(clothMaxBounds[clothIdentifier].z, positionUInt.z);
+		}
+	}
+	
+	AllMemoryBarrierWithGroupSync();
+
+
+	// Use global atomics to update the global versions of the data
+	if( GTid.x < numSoftBodies )
+	{
+		InterlockedMin(g_clothMinBounds[GTid.x].x, clothMinBounds[GTid.x].x);
+		InterlockedMin(g_clothMinBounds[GTid.x].y, clothMinBounds[GTid.x].y);
+		InterlockedMin(g_clothMinBounds[GTid.x].z, clothMinBounds[GTid.x].z);
+
+		InterlockedMax(g_clothMaxBounds[GTid.x].x, clothMaxBounds[GTid.x].x);		
+		InterlockedMax(g_clothMaxBounds[GTid.x].y, clothMaxBounds[GTid.x].y);
+		InterlockedMax(g_clothMaxBounds[GTid.x].z, clothMaxBounds[GTid.x].z);
+	}
+}
+
+
+);
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocities.hlsl b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocities.hlsl
new file mode 100644
index 000000000..399912f49
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocities.hlsl
@@ -0,0 +1,170 @@
+MSTRINGIFY(
+
+cbuffer SolvePositionsFromLinksKernelCB : register( b0 )
+{
+	unsigned int numNodes;
+	float isolverdt;
+	int padding0;
+	int padding1;
+};
+
+struct CollisionObjectIndices
+{
+	int firstObject;
+	int endObject;
+};
+
+struct CollisionShapeDescription
+{
+	float4x4 shapeTransform;
+	float4 linearVelocity;
+	float4 angularVelocity;
+
+	int softBodyIdentifier;
+	int collisionShapeType;
+	
+
+	// Shape information
+	// Compressed from the union
+	float radius;
+	float halfHeight;
+		
+	float margin;
+	float friction;
+
+	int padding0;
+	int padding1;
+	
+};
+
+// From btBroadphaseProxy.h
+static const int CAPSULE_SHAPE_PROXYTYPE = 10;
+
+// Node indices for each link
+StructuredBuffer<int> g_vertexClothIdentifier : register( t0 );
+StructuredBuffer<float4> g_vertexPreviousPositions : register( t1 );
+StructuredBuffer<float> g_perClothFriction : register( t2 );
+StructuredBuffer<float> g_clothDampingFactor : register( t3 );
+StructuredBuffer<CollisionObjectIndices> g_perClothCollisionObjectIndices : register( t4 );
+StructuredBuffer<CollisionShapeDescription> g_collisionObjectDetails : register( t5 );
+
+RWStructuredBuffer<float4> g_vertexForces : register( u0 );
+RWStructuredBuffer<float4> g_vertexVelocities : register( u1 );
+RWStructuredBuffer<float4> g_vertexPositions : register( u2 );
+
+[numthreads(128, 1, 1)]
+void 
+SolveCollisionsAndUpdateVelocitiesKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int nodeID = DTid.x;
+	float3 forceOnVertex = float3(0.f, 0.f, 0.f);
+	if( DTid.x < numNodes )
+	{	
+		int clothIdentifier = g_vertexClothIdentifier[nodeID];
+		float4 position = float4(g_vertexPositions[nodeID].xyz, 1.f);
+		float4 previousPosition = float4(g_vertexPreviousPositions[nodeID].xyz, 1.f);
+		float3 velocity;
+		float clothFriction = g_perClothFriction[clothIdentifier];
+		float dampingFactor = g_clothDampingFactor[clothIdentifier];
+		float velocityCoefficient = (1.f - dampingFactor);		
+		CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
+	
+		if( collisionObjectIndices.firstObject != collisionObjectIndices.endObject )
+		{
+			velocity = float3(15, 0, 0);
+
+			// We have some possible collisions to deal with
+			for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision )
+			{
+				CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision];
+				float colliderFriction = shapeDescription.friction;
+		
+				if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
+				{
+					// Colliding with a capsule
+
+					float capsuleHalfHeight = shapeDescription.halfHeight;
+					float capsuleRadius = shapeDescription.radius;
+					float capsuleMargin = shapeDescription.margin;
+					float4x4 worldTransform = shapeDescription.shapeTransform;
+
+					float4 c1 = float4(0.f, -capsuleHalfHeight, 0.f, 1.f); 
+					float4 c2 = float4(0.f, +capsuleHalfHeight, 0.f, 1.f);
+					float4 worldC1 = mul(worldTransform, c1);
+					float4 worldC2 = mul(worldTransform, c2);
+					float3 segment = (worldC2 - worldC1).xyz;
+
+					// compute distance of tangent to vertex along line segment in capsule
+					float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
+
+					float4 closestPoint = (worldC1 + float4(segment * distanceAlongSegment, 0.f));
+					float distanceFromLine = length(position - closestPoint);
+					float distanceFromC1 = length(worldC1 - position);
+					float distanceFromC2 = length(worldC2 - position);
+					
+					// Final distance from collision, point to push from, direction to push in
+					// for impulse force
+					float dist;
+					float3 normalVector;
+					if( distanceAlongSegment < 0 )
+					{
+						dist = distanceFromC1;
+						normalVector = normalize(position - worldC1).xyz;
+					} else if( distanceAlongSegment > 1.f ) {
+						dist = distanceFromC2;
+						normalVector = normalize(position - worldC2).xyz;	
+					} else {
+						dist = distanceFromLine;
+						normalVector = normalize(position - closestPoint).xyz;
+					}
+						
+					float3 colliderLinearVelocity = shapeDescription.linearVelocity.xyz;
+					float3 colliderAngularVelocity = shapeDescription.angularVelocity.xyz;
+					float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - worldTransform._m03_m13_m23);
+
+					float minDistance = capsuleRadius + capsuleMargin;
+					
+					// In case of no collision, this is the value of velocity
+					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+					
+					
+					// Check for a collision
+					if( dist < minDistance )
+					{
+						// Project back to surface along normal
+						position = position + float4((minDistance - dist)*normalVector*0.9, 0.f);
+						velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+						float3 relativeVelocity = velocity - velocityOfSurfacePoint;
+
+						float3 p1 = normalize(cross(normalVector, segment));
+						float3 p2 = normalize(cross(p1, normalVector));
+						// Full friction is sum of velocities in each direction of plane
+						float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
+
+						// Real friction is peak friction corrected by friction coefficients
+						frictionVector = frictionVector * (colliderFriction*clothFriction);
+
+						float approachSpeed = dot(relativeVelocity, normalVector);
+
+						if( approachSpeed <= 0.0 )
+							forceOnVertex -= frictionVector;
+					}
+					
+				}
+			}
+		} else {
+			// Update velocity	
+			float3 difference = position.xyz - previousPosition.xyz;
+			velocity = difference*velocityCoefficient*isolverdt;			
+		}
+
+		g_vertexVelocities[nodeID] = float4(velocity, 0.f);	
+
+		// Update external force
+		g_vertexForces[nodeID] = float4(forceOnVertex, 0.f);
+
+		g_vertexPositions[nodeID] = float4(position.xyz, 0.f);
+	}
+}
+
+);
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocitiesSIMDBatched.hlsl b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocitiesSIMDBatched.hlsl
new file mode 100644
index 000000000..9bb7e4da1
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocitiesSIMDBatched.hlsl
@@ -0,0 +1,191 @@
+MSTRINGIFY(
+
+cbuffer SolvePositionsFromLinksKernelCB : register( b0 )
+{
+	unsigned int numNodes;
+	float isolverdt;
+	int padding0;
+	int padding1;
+};
+
+struct CollisionObjectIndices
+{
+	int firstObject;
+	int endObject;
+};
+
+struct CollisionShapeDescription
+{
+	float4x4 shapeTransform;
+	float4 linearVelocity;
+	float4 angularVelocity;
+
+	int softBodyIdentifier;
+	int collisionShapeType;
+	
+
+	// Shape information
+	// Compressed from the union
+	float radius;
+	float halfHeight;
+		
+	float margin;
+	float friction;
+
+	int padding0;
+	int padding1;
+	
+};
+
+// From btBroadphaseProxy.h
+static const int CAPSULE_SHAPE_PROXYTYPE = 10;
+
+// Node indices for each link
+StructuredBuffer<int> g_vertexClothIdentifier : register( t0 );
+StructuredBuffer<float4> g_vertexPreviousPositions : register( t1 );
+StructuredBuffer<float> g_perClothFriction : register( t2 );
+StructuredBuffer<float> g_clothDampingFactor : register( t3 );
+StructuredBuffer<CollisionObjectIndices> g_perClothCollisionObjectIndices : register( t4 );
+StructuredBuffer<CollisionShapeDescription> g_collisionObjectDetails : register( t5 );
+
+RWStructuredBuffer<float4> g_vertexForces : register( u0 );
+RWStructuredBuffer<float4> g_vertexVelocities : register( u1 );
+RWStructuredBuffer<float4> g_vertexPositions : register( u2 );
+
+// A buffer of local collision shapes
+// TODO: Iterate to support more than 16
+groupshared CollisionShapeDescription localCollisionShapes[16];
+
+[numthreads(128, 1, 1)]
+void 
+SolveCollisionsAndUpdateVelocitiesKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int nodeID = DTid.x;
+	float3 forceOnVertex = float3(0.f, 0.f, 0.f);
+
+	int clothIdentifier = g_vertexClothIdentifier[nodeID];
+	float4 position = float4(g_vertexPositions[nodeID].xyz, 1.f);
+	float4 previousPosition = float4(g_vertexPreviousPositions[nodeID].xyz, 1.f);
+	float3 velocity;
+	float clothFriction = g_perClothFriction[clothIdentifier];
+	float dampingFactor = g_clothDampingFactor[clothIdentifier];
+	float velocityCoefficient = (1.f - dampingFactor);		
+	CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
+	
+	int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
+	if( numObjects > 0 )
+	{
+		// We have some possible collisions to deal with
+		
+		// First load all of the collision objects into LDS
+		int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
+		if( GTid.x < numObjects )
+		{
+			localCollisionShapes[GTid.x] = g_collisionObjectDetails[ collisionObjectIndices.firstObject + GTid.x ];
+		}
+	}
+
+	// Safe as the vertices are padded so that not more than one soft body is in a group
+	AllMemoryBarrierWithGroupSync();
+
+	// Annoyingly, even though I know the flow control is not varying, the compiler will not let me skip this
+	if( numObjects > 0 )
+	{
+		velocity = float3(0, 0, 0);
+		
+		
+		// We have some possible collisions to deal with
+		for( int collision = 0; collision < numObjects; ++collision )
+		{
+			CollisionShapeDescription shapeDescription = localCollisionShapes[collision];
+			float colliderFriction = shapeDescription.friction;
+		
+			if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
+			{
+				// Colliding with a capsule
+
+				float capsuleHalfHeight = localCollisionShapes[collision].halfHeight;
+				float capsuleRadius = localCollisionShapes[collision].radius;
+				float capsuleMargin = localCollisionShapes[collision].margin;
+
+				float4x4 worldTransform = localCollisionShapes[collision].shapeTransform;
+
+				float4 c1 = float4(0.f, -capsuleHalfHeight, 0.f, 1.f); 
+				float4 c2 = float4(0.f, +capsuleHalfHeight, 0.f, 1.f);
+				float4 worldC1 = mul(worldTransform, c1);
+				float4 worldC2 = mul(worldTransform, c2);
+				float3 segment = (worldC2 - worldC1).xyz;
+
+				// compute distance of tangent to vertex along line segment in capsule
+				float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
+
+				float4 closestPoint = (worldC1 + float4(segment * distanceAlongSegment, 0.f));
+				float distanceFromLine = length(position - closestPoint);
+				float distanceFromC1 = length(worldC1 - position);
+				float distanceFromC2 = length(worldC2 - position);
+					
+				// Final distance from collision, point to push from, direction to push in
+				// for impulse force
+				float dist;
+				float3 normalVector;
+				if( distanceAlongSegment < 0 )
+				{
+					dist = distanceFromC1;
+					normalVector = normalize(position - worldC1).xyz;
+				} else if( distanceAlongSegment > 1.f ) {
+					dist = distanceFromC2;
+					normalVector = normalize(position - worldC2).xyz;	
+				} else {
+					dist = distanceFromLine;
+					normalVector = normalize(position - closestPoint).xyz;
+				}
+						
+				float3 colliderLinearVelocity = localCollisionShapes[collision].linearVelocity.xyz;
+				float3 colliderAngularVelocity = localCollisionShapes[collision].angularVelocity.xyz;
+				float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - worldTransform._m03_m13_m23);
+
+				float minDistance = capsuleRadius + capsuleMargin;
+					
+				// In case of no collision, this is the value of velocity
+				velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+					
+					
+				// Check for a collision
+				if( dist < minDistance )
+				{
+					// Project back to surface along normal
+					position = position + float4((minDistance - dist)*normalVector*0.9, 0.f);
+					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+					float3 relativeVelocity = velocity - velocityOfSurfacePoint;
+
+					float3 p1 = normalize(cross(normalVector, segment));
+					float3 p2 = normalize(cross(p1, normalVector));
+					// Full friction is sum of velocities in each direction of plane
+					float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
+
+					// Real friction is peak friction corrected by friction coefficients
+					frictionVector = frictionVector * (colliderFriction*clothFriction);
+
+					float approachSpeed = dot(relativeVelocity, normalVector);
+
+					if( approachSpeed <= 0.0 )
+						forceOnVertex -= frictionVector;
+				}
+					
+			}
+		}
+	} else {
+		// Update velocity	
+		float3 difference = position.xyz - previousPosition.xyz;
+		velocity = difference*velocityCoefficient*isolverdt;			
+	}
+
+	g_vertexVelocities[nodeID] = float4(velocity, 0.f);	
+
+	// Update external force
+	g_vertexForces[nodeID] = float4(forceOnVertex, 0.f);
+
+	g_vertexPositions[nodeID] = float4(position.xyz, 0.f);
+}
+
+);
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ComputeBounds.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ComputeBounds.cl
new file mode 100644
index 000000000..15c0cdc67
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ComputeBounds.cl
@@ -0,0 +1,80 @@
+MSTRINGIFY(
+#pragma OPENCL EXTENSION cl_amd_printf : enable \n
+
+
+
+__kernel void
+ComputeBoundsKernel( 
+	int numNodes,
+	int numSoftBodies,
+	__global int * g_vertexClothIdentifier,
+	__global float4 * g_vertexPositions,
+	volatile __global uint * g_clothMinBounds,
+	volatile __global uint * g_clothMaxBounds,
+	volatile __local uint * clothMinBounds,
+	volatile __local uint * clothMaxBounds)
+{
+	// Init min and max bounds arrays
+	if( get_local_id(0) < numSoftBodies )
+	{
+		
+		clothMinBounds[get_local_id(0)*4] = UINT_MAX;
+		clothMinBounds[get_local_id(0)*4+1] = UINT_MAX;
+		clothMinBounds[get_local_id(0)*4+2] = UINT_MAX;
+		clothMinBounds[get_local_id(0)*4+3] = UINT_MAX;
+		clothMaxBounds[get_local_id(0)*4] = 0;
+		clothMaxBounds[get_local_id(0)*4+1] = 0;
+		clothMaxBounds[get_local_id(0)*4+2] = 0;
+		clothMaxBounds[get_local_id(0)*4+3] = 0;
+
+	}
+
+
+	barrier(CLK_GLOBAL_MEM_FENCE);
+
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{	
+		int clothIdentifier = g_vertexClothIdentifier[get_global_id(0)];
+		if( clothIdentifier >= 0 )
+		{
+			float3 position = g_vertexPositions[get_global_id(0)].xyz;
+
+			/* Reinterpret position as uint */
+			uint3 positionUInt = (uint3)(as_uint(position.x), as_uint(position.y), as_uint(position.z));
+		
+			/* Invert sign bit of positives and whole of negatives to allow comparison as unsigned ints */
+			positionUInt.x ^= (1+~(positionUInt.x >> 31) | 0x80000000);
+			positionUInt.y ^= (1+~(positionUInt.y >> 31) | 0x80000000);		
+			positionUInt.z ^= (1+~(positionUInt.z >> 31) | 0x80000000);
+		
+			/* Min/max with the LDS values */
+			atomic_min(&(clothMinBounds[clothIdentifier*4]), positionUInt.x);
+			atomic_min(&(clothMinBounds[clothIdentifier*4+1]), positionUInt.y);
+			atomic_min(&(clothMinBounds[clothIdentifier*4+2]), positionUInt.z);
+
+			atomic_max(&(clothMaxBounds[clothIdentifier*4]), positionUInt.x);
+			atomic_max(&(clothMaxBounds[clothIdentifier*4+1]), positionUInt.y);
+			atomic_max(&(clothMaxBounds[clothIdentifier*4+2]), positionUInt.z);
+		}
+	}
+	
+	barrier(CLK_GLOBAL_MEM_FENCE);
+
+
+	/* Use global atomics to update the global versions of the data*/
+	if( get_local_id(0) < numSoftBodies )
+	{
+		/*atomic_min(&(g_clothMinBounds[get_local_id(0)].x), clothMinBounds[get_local_id(0)].x);*/
+		atomic_min(&(g_clothMinBounds[get_local_id(0)*4]), clothMinBounds[get_local_id(0)*4]);
+		atomic_min(&(g_clothMinBounds[get_local_id(0)*4+1]), clothMinBounds[get_local_id(0)*4+1]);
+		atomic_min(&(g_clothMinBounds[get_local_id(0)*4+2]), clothMinBounds[get_local_id(0)*4+2]);
+
+		atomic_max(&(g_clothMaxBounds[get_local_id(0)*4]), clothMaxBounds[get_local_id(0)*4]);		
+		atomic_max(&(g_clothMaxBounds[get_local_id(0)*4+1]), clothMaxBounds[get_local_id(0)*4+1]);
+		atomic_max(&(g_clothMaxBounds[get_local_id(0)*4+2]), clothMaxBounds[get_local_id(0)*4+2]);
+	}
+}
+
+
+);
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/OutputToVertexArray.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/OutputToVertexArray.cl
new file mode 100644
index 000000000..4bc614c09
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/OutputToVertexArray.cl
@@ -0,0 +1,57 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+cbuffer OutputToVertexArrayCB : register( b0 )
+{
+	int startNode;
+	int numNodes;
+	int offsetX;
+	int strideX;
+	
+	int offsetN;	
+	int strideN;
+	int padding1;
+	int padding2;
+};
+
+
+StructuredBuffer<float4> g_nodesx : register( t0 );
+StructuredBuffer<float4> g_nodesn : register( t1 );
+
+RWStructuredBuffer<float> g_vertexBuffer : register( u0 );
+
+
+[numthreads(128, 1, 1)]
+void 
+OutputToVertexArrayKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	int nodeID = DTid.x;
+	if( nodeID < numNodes )
+	{			
+		float4 nodeX = g_nodesx[nodeID + startNode];
+		float4 nodeN = g_nodesn[nodeID + startNode];
+		
+		// Stride should account for the float->float4 conversion
+		int positionDestination = nodeID * strideX + offsetX;		
+		g_vertexBuffer[positionDestination] = nodeX.x;
+		g_vertexBuffer[positionDestination+1] = nodeX.y;
+		g_vertexBuffer[positionDestination+2] = nodeX.z;
+		
+		int normalDestination = nodeID * strideN + offsetN;
+		g_vertexBuffer[normalDestination] = nodeN.x;
+		g_vertexBuffer[normalDestination+1] = nodeN.y;
+		g_vertexBuffer[normalDestination+2] = nodeN.z;		
+	}
+}
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositionsSIMDBatched.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositionsSIMDBatched.cl
new file mode 100644
index 000000000..28255300e
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositionsSIMDBatched.cl
@@ -0,0 +1,139 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+MSTRINGIFY(
+
+float mydot3(float4 a, float4 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+__kernel void 
+SolvePositionsFromLinksKernel( 
+	const int startWaveInBatch,
+	const int numWaves,
+	const float kst,
+	const float ti,
+	__global int2 *g_wavefrontBatchCountsVertexCounts,
+	__global int *g_vertexAddressesPerWavefront,
+	__global int2 * g_linksVertexIndices,
+	__global float * g_linksMassLSC,
+	__global float * g_linksRestLengthSquared,
+	__global float * g_verticesInverseMass,
+	__global float4 * g_vertexPositions,
+	__local int2 *wavefrontBatchCountsVertexCounts,
+	__local float4 *vertexPositionSharedData,
+	__local float *vertexInverseMassSharedData)
+{
+	const int laneInWavefront = (get_global_id(0) & (WAVEFRONT_SIZE-1));
+	const int wavefront = startWaveInBatch + (get_global_id(0) / WAVEFRONT_SIZE);
+	const int firstWavefrontInBlock = startWaveInBatch + get_group_id(0) * WAVEFRONT_BLOCK_MULTIPLIER;
+	const int localWavefront = wavefront - firstWavefrontInBlock;
+
+	// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier	
+	if( wavefront < (startWaveInBatch + numWaves) )
+	{	
+		// Load the batch counts for the wavefronts
+		// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier
+		if( laneInWavefront == 0 )
+		{
+			int2 batchesAndVertexCountsWithinWavefront = g_wavefrontBatchCountsVertexCounts[wavefront];
+			wavefrontBatchCountsVertexCounts[localWavefront] = batchesAndVertexCountsWithinWavefront;
+		}
+
+		
+		mem_fence(CLK_LOCAL_MEM_FENCE);
+		
+
+		int2 batchesAndVerticesWithinWavefront = wavefrontBatchCountsVertexCounts[localWavefront];
+		int batchesWithinWavefront = batchesAndVerticesWithinWavefront.x;
+		int verticesUsedByWave = batchesAndVerticesWithinWavefront.y;
+
+		// Load the vertices for the wavefronts
+		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
+		{
+			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+
+			vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_vertexPositions[vertexAddress];
+			vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress];
+		}
+		
+		mem_fence(CLK_LOCAL_MEM_FENCE);
+
+		// Loop through the batches performing the solve on each in LDS
+		int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE;	
+
+		//for( int batch = 0; batch < batchesWithinWavefront; ++batch )
+		
+		int batch = 0;
+		do
+		{
+			int baseDataLocation = baseDataLocationForWave + WAVEFRONT_SIZE * batch;
+			int locationOfValue = baseDataLocation + laneInWavefront;
+			
+			
+			// These loads should all be perfectly linear across the WF
+			int2 localVertexIndices = g_linksVertexIndices[locationOfValue];
+			float massLSC = g_linksMassLSC[locationOfValue];
+			float restLengthSquared = g_linksRestLengthSquared[locationOfValue];
+			
+			// LDS vertex addresses based on logical wavefront number in block and loaded index
+			int vertexAddress0 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.x;
+			int vertexAddress1 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.y;
+			
+			float4 position0 = vertexPositionSharedData[vertexAddress0];
+			float4 position1 = vertexPositionSharedData[vertexAddress1];
+
+			float inverseMass0 = vertexInverseMassSharedData[vertexAddress0];
+			float inverseMass1 = vertexInverseMassSharedData[vertexAddress1]; 
+
+			float4 del = position1 - position0;
+			float len = mydot3(del, del);
+			
+			float k = 0;
+			if( massLSC > 0.0f )
+			{		
+				k = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
+			}
+			
+			position0 = position0 - del*(k*inverseMass0);
+			position1 = position1 + del*(k*inverseMass1);
+			
+			// Ensure compiler does not re-order memory operations
+			mem_fence(CLK_LOCAL_MEM_FENCE);
+
+			vertexPositionSharedData[vertexAddress0] = position0;
+			vertexPositionSharedData[vertexAddress1] = position1;
+			
+			// Ensure compiler does not re-order memory operations
+			mem_fence(CLK_LOCAL_MEM_FENCE);
+				
+			
+			++batch;
+		} while( batch < batchesWithinWavefront );
+
+		// Update the global memory vertices for the wavefronts
+		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
+		{
+			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+
+			g_vertexPositions[vertexAddress] = (float4)(vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex].xyz, 0.f);
+		}		
+		
+	}
+
+}
+
+);
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocities.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocities.cl
new file mode 100644
index 000000000..9f50da8a4
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocities.cl
@@ -0,0 +1,195 @@
+MSTRINGIFY(
+
+typedef struct 
+{
+	int firstObject;
+	int endObject;
+} CollisionObjectIndices;
+
+typedef struct 
+{
+	float4 shapeTransform[4]; // column major 4x4 matrix
+	float4 linearVelocity;
+	float4 angularVelocity;
+
+	int softBodyIdentifier;
+	int collisionShapeType;
+	
+
+	// Shape information
+	// Compressed from the union
+	float radius;
+	float halfHeight;
+	int upAxis;
+		
+	float margin;
+	float friction;
+
+	int padding0;
+	
+} CollisionShapeDescription;
+
+/* From btBroadphaseProxy.h */
+__constant int CAPSULE_SHAPE_PROXYTYPE = 10;
+
+/* Multiply column-major matrix against vector */
+float4 matrixVectorMul( float4 matrix[4], float4 vector )
+{
+	float4 returnVector;
+	float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
+	float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
+	float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
+	float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
+	returnVector.x = dot(row0, vector);
+	returnVector.y = dot(row1, vector);
+	returnVector.z = dot(row2, vector);
+	returnVector.w = dot(row3, vector);
+	return returnVector;
+}
+
+__kernel void 
+SolveCollisionsAndUpdateVelocitiesKernel( 
+	const int numNodes,
+	const float isolverdt,
+	__global int *g_vertexClothIdentifier,
+	__global float4 *g_vertexPreviousPositions,
+	__global float * g_perClothFriction,
+	__global float * g_clothDampingFactor,
+	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
+	__global CollisionShapeDescription * g_collisionObjectDetails,
+	__global float4 * g_vertexForces,
+	__global float4 *g_vertexVelocities,
+	__global float4 *g_vertexPositions)
+{
+	int nodeID = get_global_id(0);
+	float3 forceOnVertex = (float3)(0.f, 0.f, 0.f);
+	if( get_global_id(0) < numNodes )
+	{	
+		int clothIdentifier = g_vertexClothIdentifier[nodeID];
+		
+		// Abort if this is not a valid cloth
+		if( clothIdentifier < 0 )
+			return;
+
+		float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f);
+		float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f);
+		float3 velocity;
+		float clothFriction = g_perClothFriction[clothIdentifier];
+		float dampingFactor = g_clothDampingFactor[clothIdentifier];
+		float velocityCoefficient = (1.f - dampingFactor);		
+		CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
+	
+		if( collisionObjectIndices.firstObject != collisionObjectIndices.endObject )
+		{
+			velocity = (float3)(15, 0, 0);
+
+			/* We have some possible collisions to deal with */
+			for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision )
+			{
+				CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision];
+				float colliderFriction = shapeDescription.friction;
+		
+				if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
+				{
+					/* Colliding with a capsule */
+
+					float capsuleHalfHeight = shapeDescription.halfHeight;
+					float capsuleRadius = shapeDescription.radius;
+					float capsuleMargin = shapeDescription.margin;
+					int capsuleupAxis = shapeDescription.upAxis;
+
+					/* Four columns of worldTransform matrix */
+					float4 worldTransform[4];
+					worldTransform[0] = shapeDescription.shapeTransform[0];
+					worldTransform[1] = shapeDescription.shapeTransform[1];
+					worldTransform[2] = shapeDescription.shapeTransform[2];
+					worldTransform[3] = shapeDescription.shapeTransform[3];
+
+					// Correctly define capsule centerline vector 
+					float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); 
+					float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
+					c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
+					c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
+					c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
+					c2.x = -c1.x;
+					c2.y = -c1.y;
+					c2.z = -c1.z;
+
+					float4 worldC1 = matrixVectorMul(worldTransform, c1);
+					float4 worldC2 = matrixVectorMul(worldTransform, c2);
+					float3 segment = (worldC2 - worldC1).xyz;
+
+					/* compute distance of tangent to vertex along line segment in capsule */
+					float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
+
+					float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment, 0.f));
+					float distanceFromLine = length(position - closestPoint);
+					float distanceFromC1 = length(worldC1 - position);
+					float distanceFromC2 = length(worldC2 - position);
+					
+					/* Final distance from collision, point to push from, direction to push in
+					   for impulse force */
+					float dist;
+					float3 normalVector;
+					if( distanceAlongSegment < 0 )
+					{
+						dist = distanceFromC1;
+						normalVector = normalize(position - worldC1).xyz;
+					} else if( distanceAlongSegment > 1.f ) {
+						dist = distanceFromC2;
+						normalVector = normalize(position - worldC2).xyz;	
+					} else {
+						dist = distanceFromLine;
+						normalVector = normalize(position - closestPoint).xyz;
+					}
+						
+					float3 colliderLinearVelocity = shapeDescription.linearVelocity.xyz;
+					float3 colliderAngularVelocity = shapeDescription.angularVelocity.xyz;
+					float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - (float3)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w));
+
+					float minDistance = capsuleRadius + capsuleMargin;
+					
+					/* In case of no collision, this is the value of velocity */
+					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+					
+					
+					// Check for a collision
+					if( dist < minDistance )
+					{
+						/* Project back to surface along normal */
+						position = position + (float4)((minDistance - dist)*normalVector*0.9f, 0.f);
+						velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+						float3 relativeVelocity = velocity - velocityOfSurfacePoint;
+
+						float3 p1 = normalize(cross(normalVector, segment));
+						float3 p2 = normalize(cross(p1, normalVector));
+						/* Full friction is sum of velocities in each direction of plane */
+						float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
+
+						/* Real friction is peak friction corrected by friction coefficients */
+						frictionVector = frictionVector * (colliderFriction*clothFriction);
+
+						float approachSpeed = dot(relativeVelocity, normalVector);
+
+						if( approachSpeed <= 0.0f )
+							forceOnVertex -= frictionVector;
+					}
+					
+				}
+			}
+		} else {
+			/* Update velocity	 */
+			float3 difference = position.xyz - previousPosition.xyz;
+			velocity = difference*velocityCoefficient*isolverdt;			
+		}
+
+		g_vertexVelocities[nodeID] = (float4)(velocity, 0.f);	
+
+		/* Update external force */
+		g_vertexForces[nodeID] = (float4)(forceOnVertex, 0.f);
+
+		g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
+	}
+}
+
+);
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl
new file mode 100644
index 000000000..5ab2a6208
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl
@@ -0,0 +1,213 @@
+MSTRINGIFY(
+
+typedef struct 
+{
+	int firstObject;
+	int endObject;
+} CollisionObjectIndices;
+
+typedef struct 
+{
+	float4 shapeTransform[4]; /* column major 4x4 matrix */
+	float4 linearVelocity;
+	float4 angularVelocity;
+
+	int softBodyIdentifier;
+	int collisionShapeType;
+	
+
+	// Shape information
+	// Compressed from the union
+	float radius;
+	float halfHeight;
+	int upAxis;
+		
+	float margin;
+	float friction;
+
+	int padding0;
+	
+} CollisionShapeDescription;
+
+/* From btBroadphaseProxy.h */
+__constant int CAPSULE_SHAPE_PROXYTYPE = 10;
+
+
+/* Multiply column-major matrix against vector */
+float4 matrixVectorMul( float4 matrix[4], float4 vector )
+{
+	float4 returnVector;
+	float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
+	float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
+	float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
+	float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
+	returnVector.x = dot(row0, vector);
+	returnVector.y = dot(row1, vector);
+	returnVector.z = dot(row2, vector);
+	returnVector.w = dot(row3, vector);
+	return returnVector;
+}
+
+__kernel void 
+SolveCollisionsAndUpdateVelocitiesKernel( 
+	const int numNodes,
+	const float isolverdt,
+	__global int *g_vertexClothIdentifier,
+	__global float4 *g_vertexPreviousPositions,
+	__global float * g_perClothFriction,
+	__global float * g_clothDampingFactor,
+	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
+	__global CollisionShapeDescription * g_collisionObjectDetails,
+	__global float4 * g_vertexForces,
+	__global float4 *g_vertexVelocities,
+	__global float4 *g_vertexPositions,
+	__local CollisionShapeDescription *localCollisionShapes)
+{
+	int nodeID = get_global_id(0);
+	float3 forceOnVertex = (float3)(0.f, 0.f, 0.f);
+
+	int clothIdentifier = g_vertexClothIdentifier[nodeID];
+	
+	// Abort if this is not a valid cloth
+	if( clothIdentifier < 0 )
+		return;
+
+	float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f);
+	float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f);
+	float3 velocity;
+	float clothFriction = g_perClothFriction[clothIdentifier];
+	float dampingFactor = g_clothDampingFactor[clothIdentifier];
+	float velocityCoefficient = (1.f - dampingFactor);		
+	CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
+	
+	int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
+	if( numObjects > 0 )
+	{
+		/* We have some possible collisions to deal with */
+		
+		/* First load all of the collision objects into LDS */
+		int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
+		if( get_local_id(0) < numObjects )
+		{
+			localCollisionShapes[get_local_id(0)] = g_collisionObjectDetails[ collisionObjectIndices.firstObject + get_local_id(0) ];
+		}
+	}
+
+	/* Safe as the vertices are padded so that not more than one soft body is in a group */
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Annoyingly, even though I know the flow control is not varying, the compiler will not let me skip this */
+	if( numObjects > 0 )
+	{
+		velocity = (float3)(0, 0, 0);
+		
+		
+		// We have some possible collisions to deal with
+		for( int collision = 0; collision < numObjects; ++collision )
+		{
+			CollisionShapeDescription shapeDescription = localCollisionShapes[collision];
+			float colliderFriction = shapeDescription.friction;
+		
+			if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
+			{
+				/* Colliding with a capsule */
+
+				float capsuleHalfHeight = localCollisionShapes[collision].halfHeight;
+				float capsuleRadius = localCollisionShapes[collision].radius;
+				float capsuleMargin = localCollisionShapes[collision].margin;
+				int capsuleupAxis = localCollisionShapes[collision].upAxis;
+
+				float4 worldTransform[4];
+				worldTransform[0] = localCollisionShapes[collision].shapeTransform[0];
+				worldTransform[1] = localCollisionShapes[collision].shapeTransform[1];
+				worldTransform[2] = localCollisionShapes[collision].shapeTransform[2];
+				worldTransform[3] = localCollisionShapes[collision].shapeTransform[3];
+
+				// Correctly define capsule centerline vector 
+				float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); 
+				float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
+				c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
+				c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
+				c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
+				c2.x = -c1.x;
+				c2.y = -c1.y;
+				c2.z = -c1.z;
+
+				float4 worldC1 = matrixVectorMul(worldTransform, c1);
+				float4 worldC2 = matrixVectorMul(worldTransform, c2);
+				float3 segment = (worldC2 - worldC1).xyz;
+
+
+				/* compute distance of tangent to vertex along line segment in capsule */
+				float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
+
+				float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment, 0.f));
+				float distanceFromLine = length(position - closestPoint);
+				float distanceFromC1 = length(worldC1 - position);
+				float distanceFromC2 = length(worldC2 - position);
+					
+				/* Final distance from collision, point to push from, direction to push in
+				   for impulse force */
+				float dist;
+				float3 normalVector;
+				if( distanceAlongSegment < 0 )
+				{
+					dist = distanceFromC1;
+					normalVector = normalize(position - worldC1).xyz;
+				} else if( distanceAlongSegment > 1.f ) {
+					dist = distanceFromC2;
+					normalVector = normalize(position - worldC2).xyz;	
+				} else {
+					dist = distanceFromLine;
+					normalVector = normalize(position - closestPoint).xyz;
+				}
+						
+				float3 colliderLinearVelocity = localCollisionShapes[collision].linearVelocity.xyz;
+				float3 colliderAngularVelocity = localCollisionShapes[collision].angularVelocity.xyz;
+				float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz -  (float3)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w));
+
+				float minDistance = capsuleRadius + capsuleMargin;
+					
+				/* In case of no collision, this is the value of velocity */
+				velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+					
+					
+				/* Check for a collision */
+				if( dist < minDistance )
+				{
+					/* Project back to surface along normal */
+					position = position + (float4)((minDistance - dist)*normalVector*0.9f, 0.f);
+					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
+					float3 relativeVelocity = velocity - velocityOfSurfacePoint;
+
+					float3 p1 = normalize(cross(normalVector, segment));
+					float3 p2 = normalize(cross(p1, normalVector));
+					/* Full friction is sum of velocities in each direction of plane */
+					float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
+
+					/* Real friction is peak friction corrected by friction coefficients */
+					frictionVector = frictionVector * (colliderFriction*clothFriction);
+
+					float approachSpeed = dot(relativeVelocity, normalVector);
+
+					if( approachSpeed <= 0.0f )
+						forceOnVertex -= frictionVector;
+				}
+					
+			}
+		}
+	} else {
+		/* Update velocity	*/
+		float3 difference = position.xyz - previousPosition.xyz;
+		velocity = difference*velocityCoefficient*isolverdt;			
+	}
+
+	g_vertexVelocities[nodeID] = (float4)(velocity, 0.f);	
+
+	/* Update external force */
+	g_vertexForces[nodeID] = (float4)(forceOnVertex, 0.f);
+
+	g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
+}
+
+);
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ComputeBounds.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ComputeBounds.cl
new file mode 100644
index 000000000..f18eada1b
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ComputeBounds.cl
@@ -0,0 +1,82 @@
+MSTRINGIFY(
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n
+
+__kernel void
+ComputeBoundsKernel( 
+	const int numNodes,
+	const int numSoftBodies,
+	__global int * g_vertexClothIdentifier,
+	__global float4 * g_vertexPositions,
+	/* Unfortunately, to get the atomics below to work these arrays cannot be */
+	/* uint4, though that is the layout of the data */
+	/* Therefore this is little-endian-only code */
+	volatile __global uint * g_clothMinBounds,
+	volatile __global uint * g_clothMaxBounds,
+	volatile __local uint * clothMinBounds,
+	volatile __local uint * clothMaxBounds)
+{
+	// Init min and max bounds arrays
+	if( get_local_id(0) < numSoftBodies )
+	{
+		
+		clothMinBounds[get_local_id(0)*4] = UINT_MAX;
+		clothMinBounds[get_local_id(0)*4+1] = UINT_MAX;
+		clothMinBounds[get_local_id(0)*4+2] = UINT_MAX;
+		clothMinBounds[get_local_id(0)*4+3] = UINT_MAX;
+		clothMaxBounds[get_local_id(0)*4] = 0;
+		clothMaxBounds[get_local_id(0)*4+1] = 0;
+		clothMaxBounds[get_local_id(0)*4+2] = 0;
+		clothMaxBounds[get_local_id(0)*4+3] = 0;
+
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{	
+		int clothIdentifier = g_vertexClothIdentifier[nodeID];
+		if( clothIdentifier >= 0 )
+		{
+
+			float4 position = (float4)(g_vertexPositions[nodeID].xyz, 0.f);
+
+			/* Reinterpret position as uint */
+			uint4 positionUInt = (uint4)(as_uint(position.x), as_uint(position.y), as_uint(position.z), 0);
+		
+			/* Invert sign bit of positives and whole of negatives to allow comparison as unsigned ints */
+			positionUInt.x ^= (1+~(positionUInt.x >> 31) | 0x80000000);
+			positionUInt.y ^= (1+~(positionUInt.y >> 31) | 0x80000000);		
+			positionUInt.z ^= (1+~(positionUInt.z >> 31) | 0x80000000);
+		
+			// Min/max with the LDS values
+			atom_min(&(clothMinBounds[clothIdentifier*4]), positionUInt.x);
+			atom_min(&(clothMinBounds[clothIdentifier*4+1]), positionUInt.y);
+			atom_min(&(clothMinBounds[clothIdentifier*4+2]), positionUInt.z);
+
+			atom_max(&(clothMaxBounds[clothIdentifier*4]), positionUInt.x);
+			atom_max(&(clothMaxBounds[clothIdentifier*4+1]), positionUInt.y);
+			atom_max(&(clothMaxBounds[clothIdentifier*4+2]), positionUInt.z);
+		}
+	}
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+
+	/* Use global atomics to update the global versions of the data */
+	if( get_local_id(0) < numSoftBodies )
+	{
+		/*atom_min(&(g_clothMinBounds[get_local_id(0)].x), clothMinBounds[get_local_id(0)].x);*/
+		atom_min(&(g_clothMinBounds[get_local_id(0)*4]), clothMinBounds[get_local_id(0)*4]);
+		atom_min(&(g_clothMinBounds[get_local_id(0)*4+1]), clothMinBounds[get_local_id(0)*4+1]);
+		atom_min(&(g_clothMinBounds[get_local_id(0)*4+2]), clothMinBounds[get_local_id(0)*4+2]);
+
+		atom_max(&(g_clothMaxBounds[get_local_id(0)*4]), clothMaxBounds[get_local_id(0)*4]);		
+		atom_max(&(g_clothMaxBounds[get_local_id(0)*4+1]), clothMaxBounds[get_local_id(0)*4+1]);
+		atom_max(&(g_clothMaxBounds[get_local_id(0)*4+2]), clothMaxBounds[get_local_id(0)*4+2]);
+	}
+}
+
+
+);
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/OutputToVertexArray.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/OutputToVertexArray.cl
new file mode 100644
index 000000000..f04e0926e
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/OutputToVertexArray.cl
@@ -0,0 +1,46 @@
+MSTRINGIFY(
+
+__kernel void 
+OutputToVertexArrayWithNormalsKernel( 
+	const int startNode, const int numNodes, __global float *g_vertexBuffer,
+	const int positionOffset, const int positionStride, const __global float4* g_vertexPositions, 
+	const int normalOffset, const int normalStride, const __global float4* g_vertexNormals  )
+{
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{			
+		float4 position = g_vertexPositions[nodeID + startNode];
+		float4 normal = g_vertexNormals[nodeID + startNode];
+		
+		// Stride should account for the float->float4 conversion
+		int positionDestination = nodeID * positionStride + positionOffset;		
+		g_vertexBuffer[positionDestination] = position.x;
+		g_vertexBuffer[positionDestination+1] = position.y;
+		g_vertexBuffer[positionDestination+2] = position.z;
+		
+		int normalDestination = nodeID * normalStride + normalOffset;
+		g_vertexBuffer[normalDestination] = normal.x;
+		g_vertexBuffer[normalDestination+1] = normal.y;
+		g_vertexBuffer[normalDestination+2] = normal.z;		
+	}
+}
+
+__kernel void 
+OutputToVertexArrayWithoutNormalsKernel(
+	const int startNode, const int numNodes, __global float *g_vertexBuffer,
+	const int positionOffset, const int positionStride, const __global float4* g_vertexPositions )
+{
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{			
+		float4 position = g_vertexPositions[nodeID + startNode];
+		
+		// Stride should account for the float->float4 conversion
+		int positionDestination = nodeID * positionStride + positionOffset;		
+		g_vertexBuffer[positionDestination] = position.x;
+		g_vertexBuffer[positionDestination+1] = position.y;
+		g_vertexBuffer[positionDestination+2] = position.z;		
+	}
+}
+
+);
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositionsSIMDBatched.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositionsSIMDBatched.cl
new file mode 100644
index 000000000..aaed72988
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositionsSIMDBatched.cl
@@ -0,0 +1,140 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+MSTRINGIFY(
+
+float mydot3(float4 a, float4 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+__kernel __attribute__((reqd_work_group_size(WAVEFRONT_BLOCK_MULTIPLIER*WAVEFRONT_SIZE, 1, 1)))
+void 
+SolvePositionsFromLinksKernel( 
+	const int startWaveInBatch,
+	const int numWaves,
+	const float kst,
+	const float ti,
+	__global int2 *g_wavefrontBatchCountsVertexCounts,
+	__global int *g_vertexAddressesPerWavefront,
+	__global int2 * g_linksVertexIndices,
+	__global float * g_linksMassLSC,
+	__global float * g_linksRestLengthSquared,
+	__global float * g_verticesInverseMass,
+	__global float4 * g_vertexPositions,
+	__local int2 *wavefrontBatchCountsVertexCounts,
+	__local float4 *vertexPositionSharedData,
+	__local float *vertexInverseMassSharedData)
+{
+	const int laneInWavefront = (get_global_id(0) & (WAVEFRONT_SIZE-1));
+	const int wavefront = startWaveInBatch + (get_global_id(0) / WAVEFRONT_SIZE);
+	const int firstWavefrontInBlock = startWaveInBatch + get_group_id(0) * WAVEFRONT_BLOCK_MULTIPLIER;
+	const int localWavefront = wavefront - firstWavefrontInBlock;
+
+	// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier	
+	if( wavefront < (startWaveInBatch + numWaves) )
+	{	
+		// Load the batch counts for the wavefronts
+		// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier
+		if( laneInWavefront == 0 )
+		{
+			int2 batchesAndVertexCountsWithinWavefront = g_wavefrontBatchCountsVertexCounts[wavefront];
+			wavefrontBatchCountsVertexCounts[localWavefront] = batchesAndVertexCountsWithinWavefront;
+		}
+
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+
+		int2 batchesAndVerticesWithinWavefront = wavefrontBatchCountsVertexCounts[localWavefront];
+		int batchesWithinWavefront = batchesAndVerticesWithinWavefront.x;
+		int verticesUsedByWave = batchesAndVerticesWithinWavefront.y;
+
+		// Load the vertices for the wavefronts
+		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
+		{
+			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+
+			vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_vertexPositions[vertexAddress];
+			vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress];
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop through the batches performing the solve on each in LDS
+		int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE;	
+
+		//for( int batch = 0; batch < batchesWithinWavefront; ++batch )
+		
+		int batch = 0;
+		do
+		{
+			int baseDataLocation = baseDataLocationForWave + WAVEFRONT_SIZE * batch;
+			int locationOfValue = baseDataLocation + laneInWavefront;
+			
+			
+			// These loads should all be perfectly linear across the WF
+			int2 localVertexIndices = g_linksVertexIndices[locationOfValue];
+			float massLSC = g_linksMassLSC[locationOfValue];
+			float restLengthSquared = g_linksRestLengthSquared[locationOfValue];
+			
+			// LDS vertex addresses based on logical wavefront number in block and loaded index
+			int vertexAddress0 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.x;
+			int vertexAddress1 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.y;
+			
+			float4 position0 = vertexPositionSharedData[vertexAddress0];
+			float4 position1 = vertexPositionSharedData[vertexAddress1];
+
+			float inverseMass0 = vertexInverseMassSharedData[vertexAddress0];
+			float inverseMass1 = vertexInverseMassSharedData[vertexAddress1]; 
+
+			float4 del = position1 - position0;
+			float len = mydot3(del, del);
+			
+			float k = 0;
+			if( massLSC > 0.0f )
+			{		
+				k = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
+			}
+			
+			position0 = position0 - del*(k*inverseMass0);
+			position1 = position1 + del*(k*inverseMass1);
+			
+			// Ensure compiler does not re-order memory operations
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			vertexPositionSharedData[vertexAddress0] = position0;
+			vertexPositionSharedData[vertexAddress1] = position1;
+			
+			// Ensure compiler does not re-order memory operations
+			barrier(CLK_LOCAL_MEM_FENCE);
+				
+			
+			++batch;
+		} while( batch < batchesWithinWavefront );
+
+		// Update the global memory vertices for the wavefronts
+		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
+		{
+			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+
+			g_vertexPositions[vertexAddress] = (float4)(vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex].xyz, 0.f);
+		}		
+		
+	}
+
+}
+
+);
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocities.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocities.cl
new file mode 100644
index 000000000..29b04024f
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocities.cl
@@ -0,0 +1,204 @@
+MSTRINGIFY(
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable\n
+
+float mydot3(float4 a, float4 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+
+typedef struct 
+{
+	int firstObject;
+	int endObject;
+} CollisionObjectIndices;
+
+typedef struct 
+{
+	float4 shapeTransform[4]; // column major 4x4 matrix
+	float4 linearVelocity;
+	float4 angularVelocity;
+
+	int softBodyIdentifier;
+	int collisionShapeType;
+	
+
+	// Shape information
+	// Compressed from the union
+	float radius;
+	float halfHeight;
+	int upAxis;
+		
+	float margin;
+	float friction;
+
+	int padding0;
+	
+} CollisionShapeDescription;
+
+// From btBroadphaseProxy.h
+__constant int CAPSULE_SHAPE_PROXYTYPE = 10;
+
+// Multiply column-major matrix against vector
+float4 matrixVectorMul( float4 matrix[4], float4 vector )
+{
+	float4 returnVector;
+	float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
+	float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
+	float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
+	float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
+	returnVector.x = dot(row0, vector);
+	returnVector.y = dot(row1, vector);
+	returnVector.z = dot(row2, vector);
+	returnVector.w = dot(row3, vector);
+	return returnVector;
+}
+
+__kernel void 
+SolveCollisionsAndUpdateVelocitiesKernel( 
+	const int numNodes,
+	const float isolverdt,
+	__global int *g_vertexClothIdentifier,
+	__global float4 *g_vertexPreviousPositions,
+	__global float * g_perClothFriction,
+	__global float * g_clothDampingFactor,
+	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
+	__global CollisionShapeDescription * g_collisionObjectDetails,
+	__global float4 * g_vertexForces,
+	__global float4 *g_vertexVelocities,
+	__global float4 *g_vertexPositions)
+{
+	int nodeID = get_global_id(0);
+	float4 forceOnVertex = (float4)(0.f, 0.f, 0.f, 0.f);
+	
+	if( get_global_id(0) < numNodes )
+	{	
+		int clothIdentifier = g_vertexClothIdentifier[nodeID];
+		
+		// Abort if this is not a valid cloth
+		if( clothIdentifier < 0 )
+			return;
+
+
+		float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f);
+		float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f);
+			
+		float clothFriction = g_perClothFriction[clothIdentifier];
+		float dampingFactor = g_clothDampingFactor[clothIdentifier];
+		float velocityCoefficient = (1.f - dampingFactor);		
+		float4 difference = position - previousPosition;
+		float4 velocity = difference*velocityCoefficient*isolverdt;
+		
+		CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
+	
+		int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
+		
+		if( numObjects > 0 )
+		{
+			// We have some possible collisions to deal with
+			for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision )
+			{
+				CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision];
+				float colliderFriction = shapeDescription.friction;
+
+				if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
+				{
+					// Colliding with a capsule
+
+					float capsuleHalfHeight = shapeDescription.halfHeight;
+					float capsuleRadius = shapeDescription.radius;
+					float capsuleMargin = shapeDescription.margin;
+					int capsuleupAxis = shapeDescription.upAxis;
+
+					// Four columns of worldTransform matrix
+					float4 worldTransform[4];
+					worldTransform[0] = shapeDescription.shapeTransform[0];
+					worldTransform[1] = shapeDescription.shapeTransform[1];
+					worldTransform[2] = shapeDescription.shapeTransform[2];
+					worldTransform[3] = shapeDescription.shapeTransform[3];
+
+					// Correctly define capsule centerline vector 
+					float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); 
+					float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
+					c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
+					c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
+					c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
+					c2.x = -c1.x;
+					c2.y = -c1.y;
+					c2.z = -c1.z;
+
+
+					float4 worldC1 = matrixVectorMul(worldTransform, c1);
+					float4 worldC2 = matrixVectorMul(worldTransform, c2);
+					float4 segment = (worldC2 - worldC1);
+
+					// compute distance of tangent to vertex along line segment in capsule
+					float distanceAlongSegment = -( mydot3( (worldC1 - position), segment ) / mydot3(segment, segment) );
+
+					float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment));
+					float distanceFromLine = length(position - closestPoint);
+					float distanceFromC1 = length(worldC1 - position);
+					float distanceFromC2 = length(worldC2 - position);
+					
+					// Final distance from collision, point to push from, direction to push in
+					// for impulse force
+					float dist;
+					float4 normalVector;
+					if( distanceAlongSegment < 0 )
+					{
+						dist = distanceFromC1;
+						normalVector = (float4)(normalize(position - worldC1).xyz, 0.f);
+					} else if( distanceAlongSegment > 1.f ) {
+						dist = distanceFromC2;
+						normalVector = (float4)(normalize(position - worldC2).xyz, 0.f);	
+					} else {
+						dist = distanceFromLine;
+						normalVector = (float4)(normalize(position - closestPoint).xyz, 0.f);
+					}
+						
+					float4 colliderLinearVelocity = shapeDescription.linearVelocity;
+					float4 colliderAngularVelocity = shapeDescription.angularVelocity;
+					float4 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position - (float4)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w, 0.f));
+
+					float minDistance = capsuleRadius + capsuleMargin;
+					
+					// In case of no collision, this is the value of velocity
+					velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
+					
+					
+					// Check for a collision
+					if( dist < minDistance )
+					{
+						// Project back to surface along normal
+						position = position + (float4)((minDistance - dist)*normalVector*0.9f);
+						velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
+						float4 relativeVelocity = velocity - velocityOfSurfacePoint;
+
+						float4 p1 = normalize(cross(normalVector, segment));
+						float4 p2 = normalize(cross(p1, normalVector));
+						// Full friction is sum of velocities in each direction of plane
+						float4 frictionVector = p1*mydot3(relativeVelocity, p1) + p2*mydot3(relativeVelocity, p2);
+
+						// Real friction is peak friction corrected by friction coefficients
+						frictionVector = frictionVector * (colliderFriction*clothFriction);
+
+						float approachSpeed = dot(relativeVelocity, normalVector);
+
+						if( approachSpeed <= 0.0f )
+							forceOnVertex -= frictionVector;
+					}
+				}
+			}
+		}
+
+		g_vertexVelocities[nodeID] = (float4)(velocity.xyz, 0.f);	
+
+		// Update external force
+		g_vertexForces[nodeID] = (float4)(forceOnVertex.xyz, 0.f);
+
+		g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
+	}
+}
+
+);
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl
new file mode 100644
index 000000000..ba57c8869
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl
@@ -0,0 +1,219 @@
+MSTRINGIFY(
+float mydot3(float4 a, float4 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+
+typedef struct 
+{
+	int firstObject;
+	int endObject;
+} CollisionObjectIndices;
+
+typedef struct 
+{
+	float4 shapeTransform[4]; // column major 4x4 matrix
+	float4 linearVelocity;
+	float4 angularVelocity;
+
+	int softBodyIdentifier;
+	int collisionShapeType;
+	
+
+	// Shape information
+	// Compressed from the union
+	float radius;
+	float halfHeight;
+	int upAxis;
+		
+	float margin;
+	float friction;
+
+	int padding0;
+	
+} CollisionShapeDescription;
+
+// From btBroadphaseProxy.h
+__constant int CAPSULE_SHAPE_PROXYTYPE = 10;
+
+
+/* Multiply column-major matrix against vector */
+float4 matrixVectorMul( float4 matrix[4], float4 vector )
+{
+	float4 returnVector;
+	float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
+	float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
+	float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
+	float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
+	returnVector.x = dot(row0, vector);
+	returnVector.y = dot(row1, vector);
+	returnVector.z = dot(row2, vector);
+	returnVector.w = dot(row3, vector);
+	return returnVector;
+}
+
+__kernel void 
+SolveCollisionsAndUpdateVelocitiesKernel( 
+	const int numNodes,
+	const float isolverdt,
+	__global int *g_vertexClothIdentifier,
+	__global float4 *g_vertexPreviousPositions,
+	__global float * g_perClothFriction,
+	__global float * g_clothDampingFactor,
+	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
+	__global CollisionShapeDescription * g_collisionObjectDetails,
+	__global float4 * g_vertexForces,
+	__global float4 *g_vertexVelocities,
+	__global float4 *g_vertexPositions,
+	__local CollisionShapeDescription *localCollisionShapes)
+{
+	int nodeID = get_global_id(0);
+	float4 forceOnVertex = (float4)(0.f, 0.f, 0.f, 0.f);
+
+	int clothIdentifier = g_vertexClothIdentifier[nodeID];
+
+	// Abort if this is not a valid cloth
+	if( clothIdentifier < 0 )
+		return;
+	
+
+	float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f);
+	float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f);
+	float clothFriction = g_perClothFriction[clothIdentifier];
+	float dampingFactor = g_clothDampingFactor[clothIdentifier];
+	float velocityCoefficient = (1.f - dampingFactor);		
+	
+	// Update velocity	
+	float4 difference = position - previousPosition;
+	float4 velocity = difference*velocityCoefficient*isolverdt;			
+	CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
+	
+	int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
+	if( numObjects > 0 )
+	{
+		// We have some possible collisions to deal with
+		
+		// First load all of the collision objects into LDS
+		int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
+		if( get_local_id(0) < numObjects )
+		{
+			localCollisionShapes[get_local_id(0)] = g_collisionObjectDetails[ collisionObjectIndices.firstObject + get_local_id(0) ];
+		}
+	}
+
+	// Safe as the vertices are padded so that not more than one soft body is in a group
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Annoyingly, even though I know the flow control is not varying, the compiler will not let me skip this
+	if( numObjects > 0 )
+	{
+		
+		
+		// We have some possible collisions to deal with
+		for( int collision = 0; collision < numObjects; ++collision )
+		{
+			//CollisionShapeDescription shapeDescription = localCollisionShapes[collision];
+			float colliderFriction = localCollisionShapes[collision].friction;
+		
+			if( localCollisionShapes[collision].collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
+			{
+				// Colliding with a capsule
+
+				float capsuleHalfHeight = localCollisionShapes[collision].halfHeight;
+				float capsuleRadius = localCollisionShapes[collision].radius;
+				float capsuleMargin = localCollisionShapes[collision].margin;
+				int capsuleupAxis = localCollisionShapes[collision].upAxis;
+
+				float4 worldTransform[4];
+				worldTransform[0] = localCollisionShapes[collision].shapeTransform[0];
+				worldTransform[1] = localCollisionShapes[collision].shapeTransform[1];
+				worldTransform[2] = localCollisionShapes[collision].shapeTransform[2];
+				worldTransform[3] = localCollisionShapes[collision].shapeTransform[3];
+
+				//float4 c1 = (float4)(0.f, -capsuleHalfHeight, 0.f, 1.f); 
+				//float4 c2 = (float4)(0.f, +capsuleHalfHeight, 0.f, 1.f);
+				// Correctly define capsule centerline vector 
+				float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); 
+				float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
+				c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
+				c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
+				c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
+				c2.x = -c1.x;
+				c2.y = -c1.y;
+				c2.z = -c1.z;
+
+				float4 worldC1 = matrixVectorMul(worldTransform, c1);
+				float4 worldC2 = matrixVectorMul(worldTransform, c2);
+				float4 segment = (worldC2 - worldC1);
+
+
+				// compute distance of tangent to vertex along line segment in capsule
+				float distanceAlongSegment = -( mydot3( (worldC1 - position), segment ) / mydot3(segment, segment) );
+
+				float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment));
+				float distanceFromLine = length(position - closestPoint);
+				float distanceFromC1 = length(worldC1 - position);
+				float distanceFromC2 = length(worldC2 - position);
+					
+				// Final distance from collision, point to push from, direction to push in
+				// for impulse force
+				float dist;
+				float4 normalVector;
+				if( distanceAlongSegment < 0 )
+				{
+					dist = distanceFromC1;
+					normalVector = normalize(position - worldC1);
+				} else if( distanceAlongSegment > 1.f ) {
+					dist = distanceFromC2;
+					normalVector = normalize(position - worldC2);	
+				} else {
+					dist = distanceFromLine;
+					normalVector = normalize(position - closestPoint);
+				}
+						
+				float4 colliderLinearVelocity = localCollisionShapes[collision].linearVelocity;
+				float4 colliderAngularVelocity = localCollisionShapes[collision].angularVelocity;
+				float4 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position - (float4)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w, 0.f));
+
+				float minDistance = capsuleRadius + capsuleMargin;
+					
+				// In case of no collision, this is the value of velocity
+				velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
+					
+					
+				// Check for a collision
+				if( dist < minDistance )
+				{
+					// Project back to surface along normal
+					position = position + (float4)((minDistance - dist)*normalVector*0.9f);
+					velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
+					float4 relativeVelocity = velocity - velocityOfSurfacePoint;
+
+					float4 p1 = (float4)(normalize(cross(normalVector, segment)).xyz, 0.f);
+					float4 p2 = (float4)(normalize(cross(p1, normalVector)).xyz, 0.f);
+					// Full friction is sum of velocities in each direction of plane
+					float4 frictionVector = p1*mydot3(relativeVelocity, p1) + p2*mydot3(relativeVelocity, p2);
+
+					// Real friction is peak friction corrected by friction coefficients
+					frictionVector = frictionVector * (colliderFriction*clothFriction);
+
+					float approachSpeed = dot(relativeVelocity, normalVector);
+
+					if( approachSpeed <= 0.0f )
+						forceOnVertex -= frictionVector;
+				}
+					
+			}
+		}
+	}
+	
+	g_vertexVelocities[nodeID] = (float4)(velocity.xyz, 0.f);	
+
+	// Update external force
+	g_vertexForces[nodeID] = (float4)(forceOnVertex.xyz, 0.f);
+
+	g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
+}
+
+);
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCLSIMDAware.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCLSIMDAware.h
new file mode 100644
index 000000000..5e3511040
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCLSIMDAware.h
@@ -0,0 +1,169 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
+
+
+#ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_SIMDAWARE_H
+#define BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_SIMDAWARE_H
+
+
+class btSoftBodyLinkDataOpenCLSIMDAware : public btSoftBodyLinkData
+{
+public:
+	bool				m_onGPU;
+
+	cl_command_queue	m_cqCommandQue;
+
+	const int m_wavefrontSize;
+	const int m_linksPerWorkItem;
+	const int m_maxLinksPerWavefront;
+	int m_maxBatchesWithinWave;
+	int m_maxVerticesWithinWave;
+	int m_numWavefronts;
+
+	int m_maxVertex;
+
+	struct NumBatchesVerticesPair
+	{
+		int numBatches;
+		int numVertices;
+	};
+
+	btAlignedObjectArray<int>							  m_linksPerWavefront;
+	btAlignedObjectArray<NumBatchesVerticesPair>		  m_numBatchesAndVerticesWithinWaves;
+	btOpenCLBuffer< NumBatchesVerticesPair >			  m_clNumBatchesAndVerticesWithinWaves;
+
+	// All arrays here will contain batches of m_maxLinksPerWavefront links
+	// ordered by wavefront.
+	// with either global vertex pairs or local vertex pairs
+	btAlignedObjectArray< int >							  m_wavefrontVerticesGlobalAddresses; // List of global vertices per wavefront
+	btOpenCLBuffer<int>									  m_clWavefrontVerticesGlobalAddresses;
+	btAlignedObjectArray< LinkNodePair >				  m_linkVerticesLocalAddresses; // Vertex pair for the link
+	btOpenCLBuffer<LinkNodePair>						  m_clLinkVerticesLocalAddresses;
+	btOpenCLBuffer<float>							      m_clLinkStrength;
+	btOpenCLBuffer<float>								  m_clLinksMassLSC;
+	btOpenCLBuffer<float>								  m_clLinksRestLengthSquared;
+	btOpenCLBuffer<float>								  m_clLinksRestLength;
+	btOpenCLBuffer<float>								  m_clLinksMaterialLinearStiffnessCoefficient;
+
+	struct BatchPair
+	{
+		int start;
+		int length;
+
+		BatchPair() :
+			start(0),
+			length(0)
+		{
+		}
+
+		BatchPair( int s, int l ) : 
+			start( s ),
+			length( l )
+		{
+		}
+	};
+
+	/**
+	 * Link addressing information for each cloth.
+	 * Allows link locations to be computed independently of data batching.
+	 */
+	btAlignedObjectArray< int >							m_linkAddresses;
+	
+	/**
+	 * Start and length values for computation batches over link data.
+	 */
+	btAlignedObjectArray< BatchPair >		m_wavefrontBatchStartLengths;
+
+	btSoftBodyLinkDataOpenCLSIMDAware(cl_command_queue queue, cl_context ctx);
+
+	virtual ~btSoftBodyLinkDataOpenCLSIMDAware();
+
+	/** Allocate enough space in all link-related arrays to fit numLinks links */
+	virtual void createLinks( int numLinks );
+	
+	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
+	virtual void setLinkAt( 
+		const LinkDescription &link, 
+		int linkIndex );
+
+	virtual bool onAccelerator();
+
+	virtual bool moveToAccelerator();
+
+	virtual bool moveFromAccelerator();
+
+	/**
+	 * Generate (and later update) the batching for the entire link set.
+	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
+	 * In theory we could delay it until just before we need the cloth.
+	 * It's a one-off overhead, though, so that is a later optimisation.
+	 */
+	void generateBatches();
+
+	int getMaxVerticesPerWavefront()
+	{
+		return m_maxVerticesWithinWave;
+	}
+
+	int getWavefrontSize()
+	{
+		return m_wavefrontSize;
+	}
+
+	int getLinksPerWorkItem()
+	{
+		return m_linksPerWorkItem;
+	}
+
+	int getMaxLinksPerWavefront()
+	{
+		return m_maxLinksPerWavefront;
+	}
+
+	int getMaxBatchesPerWavefront()
+	{
+		return m_maxBatchesWithinWave;
+	}
+
+	int getNumWavefronts()
+	{
+		return m_numWavefronts;
+	}
+
+	NumBatchesVerticesPair getNumBatchesAndVerticesWithinWavefront( int wavefront )
+	{
+		return m_numBatchesAndVerticesWithinWaves[wavefront];
+	}
+
+	int getVertexGlobalAddresses( int vertexIndex )
+	{
+		return m_wavefrontVerticesGlobalAddresses[vertexIndex];
+	}
+
+	/**
+	 * Get post-batching local addresses of the vertex pair for a link assuming all vertices used by a wavefront are loaded locally.
+	 */
+	LinkNodePair getVertexPairLocalAddresses( int linkIndex )
+	{
+		return m_linkVerticesLocalAddresses[linkIndex];
+	}
+};
+
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_SIMDAWARE_H
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp
new file mode 100644
index 000000000..b00795a9a
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp
@@ -0,0 +1,133 @@
+#include "btSoftBodySolverOutputCLtoGL.h"
+#include <stdio.h> //@todo: remove the debugging printf at some stage
+#include "btSoftBodySolver_OpenCL.h"
+#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
+#include "btSoftBodySolverVertexBuffer_OpenGL.h"
+#include "BulletSoftBody/btSoftBody.h"
+
+#if (0)//CL_VERSION_1_1 == 1)
+ //OpenCL 1.1 kernels use float3
+#define MSTRINGIFY(A) #A
+static char* OutputToVertexArrayCLString =
+#include "OpenCLC/OutputToVertexArray.cl"
+#else
+////OpenCL 1.0 kernels don't use float3
+#define MSTRINGIFY(A) #A
+static char* OutputToVertexArrayCLString =
+#include "OpenCLC10/OutputToVertexArray.cl"
+#endif //CL_VERSION_1_1
+
+	
+#define RELEASE_CL_KERNEL(kernelName) {if( kernelName ){ clReleaseKernel( kernelName ); kernelName = 0; }}
+
+static const size_t workGroupSize = 128;
+
+void btSoftBodySolverOutputCLtoGL::copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer )
+{
+
+	btSoftBodySolver *solver = softBody->getSoftBodySolver();
+	btAssert( solver->getSolverType() == btSoftBodySolver::CL_SOLVER || solver->getSolverType() == btSoftBodySolver::CL_SIMD_SOLVER );
+	btOpenCLSoftBodySolver *dxSolver = static_cast< btOpenCLSoftBodySolver * >( solver );
+	checkInitialized();
+	btOpenCLAcceleratedSoftBodyInterface* currentCloth = dxSolver->findSoftBodyInterface( softBody );
+	btSoftBodyVertexDataOpenCL &vertexData( dxSolver->m_vertexData );	
+
+	const int firstVertex = currentCloth->getFirstVertex();
+	const int lastVertex = firstVertex + currentCloth->getNumVertices();
+
+	if( vertexBuffer->getBufferType() == btVertexBufferDescriptor::OPENGL_BUFFER ) {		
+
+		const btOpenGLInteropVertexBufferDescriptor *openGLVertexBuffer = static_cast< btOpenGLInteropVertexBufferDescriptor* >(vertexBuffer);						
+		cl_int ciErrNum = CL_SUCCESS;    
+
+		cl_mem clBuffer = openGLVertexBuffer->getBuffer();		
+		cl_kernel outputKernel = outputToVertexArrayWithNormalsKernel;
+		if( !vertexBuffer->hasNormals() )
+			outputKernel = outputToVertexArrayWithoutNormalsKernel;
+
+		ciErrNum = clEnqueueAcquireGLObjects(m_cqCommandQue, 1, &clBuffer, 0, 0, NULL);
+		if( ciErrNum != CL_SUCCESS )
+		{
+			btAssert( 0 &&  "clEnqueueAcquireGLObjects(copySoftBodyToVertexBuffer)");
+		}
+
+		int numVertices = currentCloth->getNumVertices();
+
+		ciErrNum = clSetKernelArg(outputKernel, 0, sizeof(int), &firstVertex );
+		ciErrNum = clSetKernelArg(outputKernel, 1, sizeof(int), &numVertices );
+		ciErrNum = clSetKernelArg(outputKernel, 2, sizeof(cl_mem), (void*)&clBuffer );
+		if( vertexBuffer->hasVertexPositions() )
+		{
+			int vertexOffset = vertexBuffer->getVertexOffset();
+			int vertexStride = vertexBuffer->getVertexStride();
+			ciErrNum = clSetKernelArg(outputKernel, 3, sizeof(int), &vertexOffset );
+			ciErrNum = clSetKernelArg(outputKernel, 4, sizeof(int), &vertexStride );
+			ciErrNum = clSetKernelArg(outputKernel, 5, sizeof(cl_mem), (void*)&vertexData.m_clVertexPosition.m_buffer );
+
+		}
+		if( vertexBuffer->hasNormals() )
+		{
+			int normalOffset = vertexBuffer->getNormalOffset();
+			int normalStride = vertexBuffer->getNormalStride();
+			ciErrNum = clSetKernelArg(outputKernel, 6, sizeof(int), &normalOffset );
+			ciErrNum = clSetKernelArg(outputKernel, 7, sizeof(int), &normalStride );
+			ciErrNum = clSetKernelArg(outputKernel, 8, sizeof(cl_mem), (void*)&vertexData.m_clVertexNormal.m_buffer );
+
+		}
+		size_t	numWorkItems = workGroupSize*((vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, outputKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
+		if( ciErrNum != CL_SUCCESS ) 
+		{
+			btAssert( 0 &&  "enqueueNDRangeKernel(copySoftBodyToVertexBuffer)");
+		}
+
+		ciErrNum = clEnqueueReleaseGLObjects(m_cqCommandQue, 1, &clBuffer, 0, 0, 0);
+		if( ciErrNum != CL_SUCCESS )
+		{
+			btAssert( 0 &&  "clEnqueueReleaseGLObjects(copySoftBodyToVertexBuffer)");
+		}
+	} else {
+		btAssert( "Undefined output for this solver output" == false );
+	}
+
+	// clFinish in here may not be the best thing. It's possible that we should have a waitForFrameComplete function.
+	clFinish(m_cqCommandQue);
+
+} // btSoftBodySolverOutputCLtoGL::outputToVertexBuffers
+
+bool btSoftBodySolverOutputCLtoGL::buildShaders()
+{
+	// Ensure current kernels are released first
+	releaseKernels();
+
+	bool returnVal = true;
+
+	if( m_shadersInitialized )
+		return true;
+	
+	outputToVertexArrayWithNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithNormalsKernel" );
+	outputToVertexArrayWithoutNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithoutNormalsKernel" );
+
+
+	if( returnVal )
+		m_shadersInitialized = true;
+
+	return returnVal;
+} // btSoftBodySolverOutputCLtoGL::buildShaders
+
+void btSoftBodySolverOutputCLtoGL::releaseKernels()
+{
+	RELEASE_CL_KERNEL( outputToVertexArrayWithNormalsKernel );
+	RELEASE_CL_KERNEL( outputToVertexArrayWithoutNormalsKernel );
+
+	m_shadersInitialized = false;
+} // btSoftBodySolverOutputCLtoGL::releaseKernels
+
+bool btSoftBodySolverOutputCLtoGL::checkInitialized()
+{
+	if( !m_shadersInitialized )
+		if( buildShaders() )
+			m_shadersInitialized = true;
+
+	return m_shadersInitialized;
+}
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.h
new file mode 100644
index 000000000..45279b0c9
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.h
@@ -0,0 +1,62 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SOFT_BODY_SOLVER_OUTPUT_CL_TO_GL_H
+#define BT_SOFT_BODY_SOLVER_OUTPUT_CL_TO_GL_H
+
+#include "btSoftBodySolver_OpenCL.h"
+
+/** 
+ * Class to manage movement of data from a solver to a given target.
+ * This version is the CL to GL interop version.
+ */
+class btSoftBodySolverOutputCLtoGL : public btSoftBodySolverOutput
+{
+protected:
+	cl_command_queue	m_cqCommandQue;
+	cl_context			m_cxMainContext;
+	CLFunctions			clFunctions;
+	
+	cl_kernel		outputToVertexArrayWithNormalsKernel;
+	cl_kernel		outputToVertexArrayWithoutNormalsKernel;
+
+	bool m_shadersInitialized;
+	
+	virtual bool checkInitialized();	
+	virtual bool buildShaders();
+	void releaseKernels();
+public:
+	btSoftBodySolverOutputCLtoGL(cl_command_queue cqCommandQue, cl_context cxMainContext) :
+		m_cqCommandQue( cqCommandQue ),
+		m_cxMainContext( cxMainContext ),
+		clFunctions(cqCommandQue, cxMainContext),
+		outputToVertexArrayWithNormalsKernel( 0 ),
+		outputToVertexArrayWithoutNormalsKernel( 0 ),
+		m_shadersInitialized( false )
+	{
+	}
+
+	virtual ~btSoftBodySolverOutputCLtoGL()
+	{
+		releaseKernels();
+	}
+
+	/** Output current computed vertex data to the vertex buffers for all cloths in the solver. */
+	virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer );
+};
+
+
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_OUTPUT_CL_TO_GL_H
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexBuffer_OpenGL.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexBuffer_OpenGL.h
new file mode 100644
index 000000000..4d5ae31a4
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexBuffer_OpenGL.h
@@ -0,0 +1,168 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_OPENGL_H
+#define BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_OPENGL_H 
+
+
+#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
+#ifdef USE_MINICL
+	#include "MiniCL/cl.h"
+#else //USE_MINICL
+	#ifdef __APPLE__
+		#include <OpenCL/OpenCL.h>
+	#else
+		#include <CL/cl.h>
+	#endif //__APPLE__
+#endif//USE_MINICL
+#ifndef USE_MINICL
+#include <CL/cl_gl.h>
+#endif //USE_MINICL
+
+
+#ifdef _WIN32//for glut.h
+#include <windows.h>
+#endif
+
+//think different
+#if defined(__APPLE__) && !defined (VMDMESA)
+#include <OpenGL/OpenGL.h>
+#include <OpenGL/gl.h>
+#include <OpenGL/glu.h>
+#include <GLUT/glut.h>
+#else
+
+
+#ifdef _WINDOWS
+#include <windows.h>
+#include <GL/gl.h>
+#include <GL/glu.h>
+#else
+#include <GL/glut.h>
+#endif //_WINDOWS
+#endif //APPLE
+
+
+
+class btOpenGLInteropVertexBufferDescriptor : public btVertexBufferDescriptor
+{
+protected:
+	/** OpenCL context */
+	cl_context			m_context;
+
+	/** OpenCL command queue */
+	cl_command_queue	m_commandQueue;
+	
+	/** OpenCL interop buffer */
+	cl_mem m_buffer;
+
+	/** VBO in GL that is the basis of the interop buffer */
+	GLuint m_openGLVBO;
+
+
+public:
+	/**
+	 * context is the OpenCL context this interop buffer will work in.
+	 * queue is the command queue that kernels and data movement will be enqueued into.
+	 * openGLVBO is the OpenGL vertex buffer data will be copied into.
+	 * vertexOffset is the offset in floats to the first vertex.
+	 * vertexStride is the stride in floats between vertices.
+	 */
+	btOpenGLInteropVertexBufferDescriptor( cl_command_queue cqCommandQue, cl_context context, GLuint openGLVBO, int vertexOffset, int vertexStride )
+	{
+#ifndef USE_MINICL
+		cl_int ciErrNum = CL_SUCCESS;
+		m_context = context;
+		m_commandQueue = cqCommandQue;
+		
+		m_vertexOffset = vertexOffset;
+		m_vertexStride = vertexStride;
+
+		m_openGLVBO = openGLVBO;
+		
+		m_buffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, openGLVBO, &ciErrNum);
+		if( ciErrNum != CL_SUCCESS )
+		{
+			btAssert( 0 &&  "clEnqueueAcquireGLObjects(copySoftBodyToVertexBuffer)");
+		}
+
+		m_hasVertexPositions = true;
+#else
+		btAssert(0);//MiniCL shouldn't get here
+#endif
+	}
+
+	/**
+	 * context is the OpenCL context this interop buffer will work in.
+	 * queue is the command queue that kernels and data movement will be enqueued into.
+	 * openGLVBO is the OpenGL vertex buffer data will be copied into.
+	 * vertexOffset is the offset in floats to the first vertex.
+	 * vertexStride is the stride in floats between vertices.
+	 * normalOffset is the offset in floats to the first normal.
+	 * normalStride is the stride in floats between normals.
+	 */
+	btOpenGLInteropVertexBufferDescriptor( cl_command_queue cqCommandQue, cl_context context, GLuint openGLVBO, int vertexOffset, int vertexStride, int normalOffset, int normalStride )
+	{
+#ifndef USE_MINICL
+		cl_int ciErrNum = CL_SUCCESS;
+		m_context = context;
+		m_commandQueue = cqCommandQue;
+		
+		m_openGLVBO = openGLVBO;
+		
+		m_buffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, openGLVBO, &ciErrNum);
+		if( ciErrNum != CL_SUCCESS )
+		{
+			btAssert( 0 &&  "clEnqueueAcquireGLObjects(copySoftBodyToVertexBuffer)");
+		}
+
+		m_vertexOffset = vertexOffset;
+		m_vertexStride = vertexStride;
+		m_hasVertexPositions = true;
+
+		m_normalOffset = normalOffset;
+		m_normalStride = normalStride;
+		m_hasNormals = true;
+#else
+		btAssert(0);
+#endif //USE_MINICL
+		
+	}
+
+	virtual ~btOpenGLInteropVertexBufferDescriptor()
+	{
+		clReleaseMemObject( m_buffer );
+	}
+
+	/**
+	 * Return the type of the vertex buffer descriptor.
+	 */
+	virtual BufferTypes getBufferType() const
+	{
+		return OPENGL_BUFFER;
+	}
+
+	virtual cl_context getContext() const
+	{
+		return m_context;
+	}
+
+	virtual cl_mem getBuffer() const
+	{
+		return m_buffer;
+	}	
+};
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_OPENGL_H
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
new file mode 100644
index 000000000..aeee3364a
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
@@ -0,0 +1,1063 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
+#include "vectormath/vmInclude.h"
+#include <stdio.h> //@todo: remove the debugging printf at some stage
+#include "btSoftBodySolver_OpenCLSIMDAware.h"
+#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
+#include "BulletSoftBody/btSoftBody.h"
+#include "BulletCollision/CollisionShapes/btCapsuleShape.h"
+
+#define WAVEFRONT_SIZE 32
+#define WAVEFRONT_BLOCK_MULTIPLIER 2
+#define GROUP_SIZE (WAVEFRONT_SIZE*WAVEFRONT_BLOCK_MULTIPLIER)
+#define LINKS_PER_SIMD_LANE 16
+
+static const size_t workGroupSize = GROUP_SIZE;
+
+
+//CL_VERSION_1_1 seems broken on NVidia SDK so just disable it
+
+#if (0)//CL_VERSION_1_1 == 1)
+ //OpenCL 1.1 kernels use float3
+#define MSTRINGIFY(A) #A
+static char* UpdatePositionsFromVelocitiesCLString = 
+#include "OpenCLC/UpdatePositionsFromVelocities.cl"
+static char* SolvePositionsCLString = 
+#include "OpenCLC/SolvePositionsSIMDBatched.cl"
+static char* UpdateNodesCLString = 
+#include "OpenCLC/UpdateNodes.cl"
+static char* UpdatePositionsCLString = 
+#include "OpenCLC/UpdatePositions.cl"
+static char* UpdateConstantsCLString = 
+#include "OpenCLC/UpdateConstants.cl"
+static char* IntegrateCLString = 
+#include "OpenCLC/Integrate.cl"
+static char* ApplyForcesCLString = 
+#include "OpenCLC/ApplyForces.cl"
+static char* UpdateNormalsCLString = 
+#include "OpenCLC/UpdateNormals.cl"
+static char* VSolveLinksCLString = 
+#include "OpenCLC/VSolveLinks.cl"
+static char* ComputeBoundsCLString = 
+#include "OpenCLC/ComputeBounds.cl"
+static char* SolveCollisionsAndUpdateVelocitiesCLString =
+#include "OpenCLC/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl"
+static char* OutputToVertexArrayCLString =
+#include "OpenCLC/OutputToVertexArray.cl"
+#else
+////OpenCL 1.0 kernels don't use float3
+#define MSTRINGIFY(A) #A
+static char* UpdatePositionsFromVelocitiesCLString = 
+#include "OpenCLC10/UpdatePositionsFromVelocities.cl"
+static char* SolvePositionsCLString = 
+#include "OpenCLC10/SolvePositionsSIMDBatched.cl"
+static char* UpdateNodesCLString = 
+#include "OpenCLC10/UpdateNodes.cl"
+static char* UpdatePositionsCLString = 
+#include "OpenCLC10/UpdatePositions.cl"
+static char* UpdateConstantsCLString = 
+#include "OpenCLC10/UpdateConstants.cl"
+static char* IntegrateCLString = 
+#include "OpenCLC10/Integrate.cl"
+static char* ApplyForcesCLString = 
+#include "OpenCLC10/ApplyForces.cl"
+static char* UpdateNormalsCLString = 
+#include "OpenCLC10/UpdateNormals.cl"
+static char* VSolveLinksCLString = 
+#include "OpenCLC10/VSolveLinks.cl"
+static char* ComputeBoundsCLString = 
+#include "OpenCLC10/ComputeBounds.cl"
+static char* SolveCollisionsAndUpdateVelocitiesCLString =
+#include "OpenCLC10/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl"
+static char* OutputToVertexArrayCLString =
+#include "OpenCLC10/OutputToVertexArray.cl"
+#endif //CL_VERSION_1_1
+
+
+
+btSoftBodyLinkDataOpenCLSIMDAware::btSoftBodyLinkDataOpenCLSIMDAware(cl_command_queue queue,  cl_context ctx) :
+	m_cqCommandQue(queue),
+	m_wavefrontSize( WAVEFRONT_SIZE ),
+	m_linksPerWorkItem( LINKS_PER_SIMD_LANE ),
+	m_maxBatchesWithinWave( 0 ),
+	m_maxLinksPerWavefront( m_wavefrontSize * m_linksPerWorkItem ),
+	m_numWavefronts( 0 ),
+	m_maxVertex( 0 ),
+	m_clNumBatchesAndVerticesWithinWaves( queue, ctx, &m_numBatchesAndVerticesWithinWaves, true ),
+	m_clWavefrontVerticesGlobalAddresses( queue, ctx, &m_wavefrontVerticesGlobalAddresses, true ),
+	m_clLinkVerticesLocalAddresses( queue, ctx, &m_linkVerticesLocalAddresses, true ),
+	m_clLinkStrength( queue, ctx, &m_linkStrength, false ),
+	m_clLinksMassLSC( queue, ctx, &m_linksMassLSC, false ),
+	m_clLinksRestLengthSquared( queue, ctx, &m_linksRestLengthSquared, false ),
+	m_clLinksRestLength( queue, ctx, &m_linksRestLength, false ),
+	m_clLinksMaterialLinearStiffnessCoefficient( queue, ctx, &m_linksMaterialLinearStiffnessCoefficient, false )
+{
+}
+
+btSoftBodyLinkDataOpenCLSIMDAware::~btSoftBodyLinkDataOpenCLSIMDAware()
+{
+}
+
+static Vectormath::Aos::Vector3 toVector3( const btVector3 &vec )
+{
+	Vectormath::Aos::Vector3 outVec( vec.getX(), vec.getY(), vec.getZ() );
+	return outVec;
+}
+
+/** Allocate enough space in all link-related arrays to fit numLinks links */
+void btSoftBodyLinkDataOpenCLSIMDAware::createLinks( int numLinks )
+{
+	int previousSize = m_links.size();
+	int newSize = previousSize + numLinks;
+
+	btSoftBodyLinkData::createLinks( numLinks );
+
+	// Resize the link addresses array as well
+	m_linkAddresses.resize( newSize );
+}
+
+/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
+void btSoftBodyLinkDataOpenCLSIMDAware::setLinkAt( 
+	const LinkDescription &link, 
+	int linkIndex )
+{
+	btSoftBodyLinkData::setLinkAt( link, linkIndex );
+
+	if( link.getVertex0() > m_maxVertex )
+		m_maxVertex = link.getVertex0();
+	if( link.getVertex1() > m_maxVertex )
+		m_maxVertex = link.getVertex1();
+
+	// Set the link index correctly for initialisation
+	m_linkAddresses[linkIndex] = linkIndex;
+}
+
+bool btSoftBodyLinkDataOpenCLSIMDAware::onAccelerator()
+{
+	return m_onGPU;
+}
+
+bool btSoftBodyLinkDataOpenCLSIMDAware::moveToAccelerator()
+{
+	bool success = true;
+	success = success && m_clNumBatchesAndVerticesWithinWaves.moveToGPU();
+	success = success && m_clWavefrontVerticesGlobalAddresses.moveToGPU();
+	success = success && m_clLinkVerticesLocalAddresses.moveToGPU();
+	success = success && m_clLinkStrength.moveToGPU();
+	success = success && m_clLinksMassLSC.moveToGPU();
+	success = success && m_clLinksRestLengthSquared.moveToGPU();
+	success = success && m_clLinksRestLength.moveToGPU();
+	success = success && m_clLinksMaterialLinearStiffnessCoefficient.moveToGPU();
+
+	if( success ) {
+		m_onGPU = true;
+	}
+
+	return success;
+}
+
+bool btSoftBodyLinkDataOpenCLSIMDAware::moveFromAccelerator()
+{
+	bool success = true;
+	success = success && m_clNumBatchesAndVerticesWithinWaves.moveToGPU();
+	success = success && m_clWavefrontVerticesGlobalAddresses.moveToGPU();
+	success = success && m_clLinkVerticesLocalAddresses.moveToGPU();
+	success = success && m_clLinkStrength.moveFromGPU();
+	success = success && m_clLinksMassLSC.moveFromGPU();
+	success = success && m_clLinksRestLengthSquared.moveFromGPU();
+	success = success && m_clLinksRestLength.moveFromGPU();
+	success = success && m_clLinksMaterialLinearStiffnessCoefficient.moveFromGPU();
+
+	if( success ) {
+		m_onGPU = false;
+	}
+
+	return success;
+}
+
+
+
+
+
+
+
+
+btOpenCLSoftBodySolverSIMDAware::btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue, cl_context ctx) :
+	btOpenCLSoftBodySolver( queue, ctx ),
+	m_linkData(queue, ctx)
+{
+	// Initial we will clearly need to update solver constants
+	// For now this is global for the cloths linked with this solver - we should probably make this body specific 
+	// for performance in future once we understand more clearly when constants need to be updated
+	m_updateSolverConstants = true;
+
+	m_shadersInitialized = false;
+}
+
+btOpenCLSoftBodySolverSIMDAware::~btOpenCLSoftBodySolverSIMDAware()
+{
+	releaseKernels();
+}
+
+void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody * > &softBodies ,bool forceUpdate)
+{
+	if( forceUpdate|| m_softBodySet.size() != softBodies.size() )
+	{
+		// Have a change in the soft body set so update, reloading all the data
+		getVertexData().clear();
+		getTriangleData().clear();
+		getLinkData().clear();
+		m_softBodySet.resize(0);
+
+
+		for( int softBodyIndex = 0; softBodyIndex < softBodies.size(); ++softBodyIndex )
+		{
+			btSoftBody *softBody = softBodies[ softBodyIndex ];
+			using Vectormath::Aos::Matrix3;
+			using Vectormath::Aos::Point3;
+
+			// Create SoftBody that will store the information within the solver
+			btOpenCLAcceleratedSoftBodyInterface* newSoftBody = new btOpenCLAcceleratedSoftBodyInterface( softBody );
+			m_softBodySet.push_back( newSoftBody );
+
+			m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );
+			m_perClothDampingFactor.push_back(softBody->m_cfg.kDP);
+			m_perClothVelocityCorrectionCoefficient.push_back( softBody->m_cfg.kVCF );
+			m_perClothLiftFactor.push_back( softBody->m_cfg.kLF );
+			m_perClothDragFactor.push_back( softBody->m_cfg.kDG );
+			m_perClothMediumDensity.push_back(softBody->getWorldInfo()->air_density);
+			// Simple init values. Actually we'll put 0 and -1 into them at the appropriate time
+			m_perClothMinBounds.push_back( UIntVector3(UINT_MAX, UINT_MAX, UINT_MAX) );
+			m_perClothMaxBounds.push_back( UIntVector3(0, 0, 0) );
+			/*m_perClothMinBounds.push_back( UINT_MAX );
+			m_perClothMaxBounds.push_back( 0 );
+			m_perClothMinBounds.push_back( UINT_MAX );
+			m_perClothMaxBounds.push_back( 0 );
+			m_perClothMinBounds.push_back( UINT_MAX );
+			m_perClothMaxBounds.push_back( 0 );
+			m_perClothMinBounds.push_back( UINT_MAX );
+			m_perClothMaxBounds.push_back( 0 );*/
+
+
+			m_perClothFriction.push_back( softBody->getFriction() );
+			m_perClothCollisionObjects.push_back( CollisionObjectIndices(-1, -1) );
+
+			// Add space for new vertices and triangles in the default solver for now
+			// TODO: Include space here for tearing too later
+			int firstVertex = getVertexData().getNumVertices();
+			int numVertices = softBody->m_nodes.size();
+			// Round maxVertices to a multiple of the workgroup size so we know we're safe to run over in a given group
+			// maxVertices can be increased to allow tearing, but should be used sparingly because these extra verts will always be processed
+			int maxVertices = GROUP_SIZE*((numVertices+GROUP_SIZE)/GROUP_SIZE);
+			// Allocate space for new vertices in all the vertex arrays
+			getVertexData().createVertices( numVertices, softBodyIndex, maxVertices );
+
+			int firstTriangle = getTriangleData().getNumTriangles();
+			int numTriangles = softBody->m_faces.size();
+			int maxTriangles = numTriangles;
+			getTriangleData().createTriangles( maxTriangles );
+
+			// Copy vertices from softbody into the solver
+			for( int vertex = 0; vertex < numVertices; ++vertex )
+			{
+				Point3 multPoint(softBody->m_nodes[vertex].m_x.getX(), softBody->m_nodes[vertex].m_x.getY(), softBody->m_nodes[vertex].m_x.getZ());
+				btSoftBodyVertexData::VertexDescription desc;
+
+				// TODO: Position in the softbody might be pre-transformed
+				// or we may need to adapt for the pose.
+				//desc.setPosition( cloth.getMeshTransform()*multPoint );
+				desc.setPosition( multPoint );
+
+				float vertexInverseMass = softBody->m_nodes[vertex].m_im;
+				desc.setInverseMass(vertexInverseMass);
+				getVertexData().setVertexAt( desc, firstVertex + vertex );
+			}
+
+			// Copy triangles similarly
+			// We're assuming here that vertex indices are based on the firstVertex rather than the entire scene
+			for( int triangle = 0; triangle < numTriangles; ++triangle )
+			{
+				// Note that large array storage is relative to the array not to the cloth
+				// So we need to add firstVertex to each value
+				int vertexIndex0 = (softBody->m_faces[triangle].m_n[0] - &(softBody->m_nodes[0]));
+				int vertexIndex1 = (softBody->m_faces[triangle].m_n[1] - &(softBody->m_nodes[0]));
+				int vertexIndex2 = (softBody->m_faces[triangle].m_n[2] - &(softBody->m_nodes[0]));
+				btSoftBodyTriangleData::TriangleDescription newTriangle(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, vertexIndex2 + firstVertex);
+				getTriangleData().setTriangleAt( newTriangle, firstTriangle + triangle );
+				
+				// Increase vertex triangle counts for this triangle		
+				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex0)++;
+				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex1)++;
+				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex2)++;
+			}
+
+			int firstLink = getLinkData().getNumLinks();
+			int numLinks = softBody->m_links.size();
+			int maxLinks = numLinks;
+			
+			// Allocate space for the links
+			getLinkData().createLinks( numLinks );
+
+			// Add the links
+			for( int link = 0; link < numLinks; ++link )
+			{
+				int vertexIndex0 = softBody->m_links[link].m_n[0] - &(softBody->m_nodes[0]);
+				int vertexIndex1 = softBody->m_links[link].m_n[1] - &(softBody->m_nodes[0]);
+
+				btSoftBodyLinkData::LinkDescription newLink(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, softBody->m_links[link].m_material->m_kLST);
+				newLink.setLinkStrength(1.f);
+				getLinkData().setLinkAt(newLink, firstLink + link);
+			}
+			
+			newSoftBody->setFirstVertex( firstVertex );
+			newSoftBody->setFirstTriangle( firstTriangle );
+			newSoftBody->setNumVertices( numVertices );
+			newSoftBody->setMaxVertices( maxVertices );
+			newSoftBody->setNumTriangles( numTriangles );
+			newSoftBody->setMaxTriangles( maxTriangles );
+			newSoftBody->setFirstLink( firstLink );
+			newSoftBody->setNumLinks( numLinks );
+		}
+
+
+
+		updateConstants(0.f);
+
+
+		m_linkData.generateBatches();		
+		m_triangleData.generateBatches();
+
+		
+		// Build the shaders to match the batching parameters
+		buildShaders();
+	}
+}
+
+
+btSoftBodyLinkData &btOpenCLSoftBodySolverSIMDAware::getLinkData()
+{
+	// TODO: Consider setting link data to "changed" here
+	return m_linkData;
+}
+
+
+
+
+void btOpenCLSoftBodySolverSIMDAware::updateConstants( float timeStep )
+{			
+
+	using namespace Vectormath::Aos;
+
+	if( m_updateSolverConstants )
+	{
+		m_updateSolverConstants = false;
+
+		// Will have to redo this if we change the structure (tear, maybe) or various other possible changes
+
+		// Initialise link constants
+		const int numLinks = m_linkData.getNumLinks();
+		for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex )
+		{
+			btSoftBodyLinkData::LinkNodePair &vertices( m_linkData.getVertexPair(linkIndex) );
+			m_linkData.getRestLength(linkIndex) = length((m_vertexData.getPosition( vertices.vertex0 ) - m_vertexData.getPosition( vertices.vertex1 )));
+			float invMass0 = m_vertexData.getInverseMass(vertices.vertex0);
+			float invMass1 = m_vertexData.getInverseMass(vertices.vertex1);
+			float linearStiffness = m_linkData.getLinearStiffnessCoefficient(linkIndex);
+			float massLSC = (invMass0 + invMass1)/linearStiffness;
+			m_linkData.getMassLSC(linkIndex) = massLSC;
+			float restLength = m_linkData.getRestLength(linkIndex);
+			float restLengthSquared = restLength*restLength;
+			m_linkData.getRestLengthSquared(linkIndex) = restLengthSquared;
+		}
+	}
+
+}
+
+
+
+void btOpenCLSoftBodySolverSIMDAware::solveConstraints( float solverdt )
+{
+
+	using Vectormath::Aos::Vector3;
+	using Vectormath::Aos::Point3;
+	using Vectormath::Aos::lengthSqr;
+	using Vectormath::Aos::dot;
+
+	// Prepare links
+	int numLinks = m_linkData.getNumLinks();
+	int numVertices = m_vertexData.getNumVertices();
+
+	float kst = 1.f;
+	float ti = 0.f;
+
+
+	m_clPerClothDampingFactor.moveToGPU();
+	m_clPerClothVelocityCorrectionCoefficient.moveToGPU();
+
+
+	// Ensure data is on accelerator
+	m_linkData.moveToAccelerator();
+	m_vertexData.moveToAccelerator();
+
+	
+	//prepareLinks();	
+
+	prepareCollisionConstraints();
+
+	// Solve drift
+	for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
+	{
+
+		for( int i = 0; i < m_linkData.m_wavefrontBatchStartLengths.size(); ++i )
+		{
+			int startWave = m_linkData.m_wavefrontBatchStartLengths[i].start;
+			int numWaves = m_linkData.m_wavefrontBatchStartLengths[i].length;
+			solveLinksForPosition( startWave, numWaves, kst, ti );
+		}
+	} // for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
+
+	
+	// At this point assume that the force array is blank - we will overwrite it
+	solveCollisionsAndUpdateVelocities( 1.f/solverdt );
+}
+
+
+//////////////////////////////////////
+// Kernel dispatches
+
+
+void btOpenCLSoftBodySolverSIMDAware::solveLinksForPosition( int startWave, int numWaves, float kst, float ti )
+{
+	cl_int ciErrNum;
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,0, sizeof(int), &startWave);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,1, sizeof(int), &numWaves);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,2, sizeof(float), &kst);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,3, sizeof(float), &ti);
+	
+	
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,4, sizeof(cl_mem), &m_linkData.m_clNumBatchesAndVerticesWithinWaves.m_buffer);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,5, sizeof(cl_mem), &m_linkData.m_clWavefrontVerticesGlobalAddresses.m_buffer);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,6, sizeof(cl_mem), &m_linkData.m_clLinkVerticesLocalAddresses.m_buffer);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,7, sizeof(cl_mem), &m_linkData.m_clLinksMassLSC.m_buffer);
+
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,8, sizeof(cl_mem), &m_linkData.m_clLinksRestLengthSquared.m_buffer);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,9, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,10, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
+
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,11, WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_int2), 0);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,12, m_linkData.getMaxVerticesPerWavefront()*WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_float4), 0);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,13, m_linkData.getMaxVerticesPerWavefront()*WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_float), 0);
+
+	size_t	numWorkItems = workGroupSize*((numWaves*WAVEFRONT_SIZE + (workGroupSize-1)) / workGroupSize);
+	
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,solvePositionsFromLinksKernel,1,NULL,&numWorkItems,&workGroupSize,0,0,0);
+	
+	if( ciErrNum!= CL_SUCCESS ) 
+	{
+		btAssert( 0 &&  "enqueueNDRangeKernel(solvePositionsFromLinksKernel)");
+	}
+
+} // solveLinksForPosition
+
+void btOpenCLSoftBodySolverSIMDAware::solveCollisionsAndUpdateVelocities( float isolverdt )
+{
+	// Copy kernel parameters to GPU
+	m_vertexData.moveToAccelerator();
+	m_clPerClothFriction.moveToGPU();
+	m_clPerClothDampingFactor.moveToGPU();
+	m_clPerClothCollisionObjects.moveToGPU();
+	m_clCollisionObjectDetails.moveToGPU();
+	
+	cl_int ciErrNum;
+	int numVerts = m_vertexData.getNumVertices();
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 0, sizeof(int), &numVerts);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 1, sizeof(int), &isolverdt);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 2, sizeof(cl_mem),&m_vertexData.m_clClothIdentifier.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 3, sizeof(cl_mem),&m_vertexData.m_clVertexPreviousPosition.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 4, sizeof(cl_mem),&m_clPerClothFriction.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 5, sizeof(cl_mem),&m_clPerClothDampingFactor.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 6, sizeof(cl_mem),&m_clPerClothCollisionObjects.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 7, sizeof(cl_mem),&m_clCollisionObjectDetails.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 8, sizeof(cl_mem),&m_vertexData.m_clVertexForceAccumulator.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 9, sizeof(cl_mem),&m_vertexData.m_clVertexVelocity.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 10, sizeof(cl_mem),&m_vertexData.m_clVertexPosition.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 11, sizeof(CollisionShapeDescription)*16,0);
+	size_t	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	
+	if (numWorkItems)
+	{
+		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,solveCollisionsAndUpdateVelocitiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
+		
+		if( ciErrNum != CL_SUCCESS ) 
+		{
+			btAssert( 0 &&  "enqueueNDRangeKernel(solveCollisionsAndUpdateVelocitiesKernel)");
+		}
+	}
+
+} // btOpenCLSoftBodySolverSIMDAware::updateVelocitiesFromPositionsWithoutVelocities
+
+// End kernel dispatches
+/////////////////////////////////////
+
+
+
+bool btOpenCLSoftBodySolverSIMDAware::buildShaders()
+{
+	bool returnVal = true;
+
+	if( m_shadersInitialized )
+		return true;
+
+	char *wavefrontMacros = new char[256];
+
+	sprintf(
+		wavefrontMacros, 
+		"-DMAX_NUM_VERTICES_PER_WAVE=%d -DMAX_BATCHES_PER_WAVE=%d -DWAVEFRONT_SIZE=%d -DWAVEFRONT_BLOCK_MULTIPLIER=%d -DBLOCK_SIZE=%d", 
+		m_linkData.getMaxVerticesPerWavefront(),
+		m_linkData.getMaxBatchesPerWavefront(),
+		m_linkData.getWavefrontSize(),
+		WAVEFRONT_BLOCK_MULTIPLIER,
+		WAVEFRONT_BLOCK_MULTIPLIER*m_linkData.getWavefrontSize());
+	
+	updatePositionsFromVelocitiesKernel = clFunctions.compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel", "" );
+	solvePositionsFromLinksKernel = clFunctions.compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel", wavefrontMacros );
+	updateVelocitiesFromPositionsWithVelocitiesKernel = clFunctions.compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel", "" );
+	updateVelocitiesFromPositionsWithoutVelocitiesKernel = clFunctions.compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel", "" );
+	integrateKernel = clFunctions.compileCLKernelFromString( IntegrateCLString, "IntegrateKernel", "" );
+	applyForcesKernel = clFunctions.compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel", "" );
+	solveCollisionsAndUpdateVelocitiesKernel = clFunctions.compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel", "" );
+	computeBoundsKernel = clFunctions.compileCLKernelFromString( ComputeBoundsCLString, "ComputeBoundsKernel" );
+
+	// TODO: Rename to UpdateSoftBodies
+	resetNormalsAndAreasKernel = clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel", "" );
+	normalizeNormalsAndAreasKernel = clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel", "" );
+	updateSoftBodiesKernel = clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel", "" );
+
+	delete [] wavefrontMacros;
+
+	if( returnVal )
+		m_shadersInitialized = true;
+
+	return returnVal;
+}
+
+
+
+
+static Vectormath::Aos::Transform3 toTransform3( const btTransform &transform )
+{
+	Vectormath::Aos::Transform3 outTransform;
+	outTransform.setCol(0, toVector3(transform.getBasis().getColumn(0)));
+	outTransform.setCol(1, toVector3(transform.getBasis().getColumn(1)));
+	outTransform.setCol(2, toVector3(transform.getBasis().getColumn(2)));
+	outTransform.setCol(3, toVector3(transform.getOrigin()));
+	return outTransform;	
+}
+
+
+static void generateBatchesOfWavefronts( btAlignedObjectArray < btAlignedObjectArray <int> > &linksForWavefronts, btSoftBodyLinkData &linkData, int numVertices, btAlignedObjectArray < btAlignedObjectArray <int> > &wavefrontBatches )
+{
+	// A per-batch map of truth values stating whether a given vertex is in that batch
+	// This allows us to significantly optimize the batching
+	btAlignedObjectArray <btAlignedObjectArray<bool> > mapOfVerticesInBatches;
+
+	for( int waveIndex = 0; waveIndex < linksForWavefronts.size(); ++waveIndex )
+	{
+		btAlignedObjectArray <int> &wavefront( linksForWavefronts[waveIndex] );
+
+		int batch = 0;
+		bool placed = false;
+		while( batch < wavefrontBatches.size() && !placed )
+		{
+			// Test the current batch, see if this wave shares any vertex with the waves in the batch
+			bool foundSharedVertex = false;
+			for( int link = 0; link < wavefront.size(); ++link )
+			{
+				btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
+				if( (mapOfVerticesInBatches[batch])[vertices.vertex0] || (mapOfVerticesInBatches[batch])[vertices.vertex1] )
+				{
+					foundSharedVertex = true;
+				}
+			}
+
+			if( !foundSharedVertex )
+			{
+				wavefrontBatches[batch].push_back( waveIndex );	
+				// Insert vertices into this batch too
+				for( int link = 0; link < wavefront.size(); ++link )
+				{
+					btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
+					(mapOfVerticesInBatches[batch])[vertices.vertex0] = true;
+					(mapOfVerticesInBatches[batch])[vertices.vertex1] = true;
+				}
+				placed = true;
+			}
+			batch++;
+		}
+		if( batch == wavefrontBatches.size() && !placed )
+		{
+			wavefrontBatches.resize( batch + 1 );
+			wavefrontBatches[batch].push_back( waveIndex );
+
+			// And resize map as well
+			mapOfVerticesInBatches.resize( batch + 1 );
+			
+			// Resize maps with total number of vertices
+			mapOfVerticesInBatches[batch].resize( numVertices, false );
+
+			// Insert vertices into this batch too
+			for( int link = 0; link < wavefront.size(); ++link )
+			{
+				btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
+				(mapOfVerticesInBatches[batch])[vertices.vertex0] = true;
+				(mapOfVerticesInBatches[batch])[vertices.vertex1] = true;
+			}
+		}
+	}
+	mapOfVerticesInBatches.clear();
+}
+
+// Function to remove an object from a vector maintaining correct ordering of the vector
+template< typename T > static void removeFromVector( btAlignedObjectArray< T > &vectorToUpdate, int indexToRemove )
+{
+	int currentSize = vectorToUpdate.size();
+	for( int i = indexToRemove; i < (currentSize-1); ++i )
+	{
+		vectorToUpdate[i] = vectorToUpdate[i+1];
+	}
+	if( currentSize > 0 )
+		vectorToUpdate.resize( currentSize - 1 );
+}
+
+/**
+ * Insert element into vectorToUpdate at index index.
+ */
+template< typename T > static void insertAtIndex( btAlignedObjectArray< T > &vectorToUpdate, int index, T element )
+{
+	vectorToUpdate.resize( vectorToUpdate.size() + 1 );
+	for( int i = (vectorToUpdate.size() - 1); i > index; --i )
+	{
+		vectorToUpdate[i] = vectorToUpdate[i-1];
+	}
+	vectorToUpdate[index] = element;
+}
+
+/** 
+ * Insert into btAlignedObjectArray assuming the array is ordered and maintaining both ordering and uniqueness.
+ * ie it treats vectorToUpdate as an ordered set.
+ */
+template< typename T > static void insertUniqueAndOrderedIntoVector( btAlignedObjectArray<T> &vectorToUpdate, T element )
+{
+	int index = 0;
+	while( index < vectorToUpdate.size() && vectorToUpdate[index] < element )
+	{
+		index++;
+	}
+	if( index == vectorToUpdate.size() || vectorToUpdate[index] != element )
+		insertAtIndex( vectorToUpdate, index, element );
+}
+
+static void generateLinksPerVertex( int numVertices, btSoftBodyLinkData &linkData, btAlignedObjectArray< int > &listOfLinksPerVertex, btAlignedObjectArray <int> &numLinksPerVertex, int &maxLinks )
+{
+	for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )
+	{
+		btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );
+		numLinksPerVertex[nodes.vertex0]++;
+		numLinksPerVertex[nodes.vertex1]++;
+	}
+	int maxLinksPerVertex = 0;
+	for( int vertexIndex = 0; vertexIndex < numVertices; ++vertexIndex )
+	{
+		maxLinksPerVertex = btMax(numLinksPerVertex[vertexIndex], maxLinksPerVertex);
+	}
+	maxLinks = maxLinksPerVertex;
+
+	btAlignedObjectArray< int > linksFoundPerVertex;
+	linksFoundPerVertex.resize( numVertices, 0 );
+
+	listOfLinksPerVertex.resize( maxLinksPerVertex * numVertices );
+
+	for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )
+	{
+		btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );
+		{
+			// Do vertex 0
+			int vertexIndex = nodes.vertex0;
+			int linkForVertex = linksFoundPerVertex[nodes.vertex0];
+			int linkAddress = vertexIndex * maxLinksPerVertex + linkForVertex;
+
+			listOfLinksPerVertex[linkAddress] = linkIndex;
+
+			linksFoundPerVertex[nodes.vertex0] = linkForVertex + 1;
+		}
+		{
+			// Do vertex 1
+			int vertexIndex = nodes.vertex1;
+			int linkForVertex = linksFoundPerVertex[nodes.vertex1];
+			int linkAddress = vertexIndex * maxLinksPerVertex + linkForVertex;
+
+			listOfLinksPerVertex[linkAddress] = linkIndex;
+
+			linksFoundPerVertex[nodes.vertex1] = linkForVertex + 1;
+		}
+	}
+}
+
+static void computeBatchingIntoWavefronts( 
+	btSoftBodyLinkData &linkData, 
+	int wavefrontSize, 
+	int linksPerWorkItem, 
+	int maxLinksPerWavefront, 
+	btAlignedObjectArray < btAlignedObjectArray <int> > &linksForWavefronts, 
+	btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray <int> > > &batchesWithinWaves, /* wave, batch, links in batch */
+	btAlignedObjectArray< btAlignedObjectArray< int > > &verticesForWavefronts /* wavefront, vertex */
+	)
+{
+	
+
+	// Attempt generation of larger batches of links.
+	btAlignedObjectArray< bool > processedLink;
+	processedLink.resize( linkData.getNumLinks() );
+	btAlignedObjectArray< int > listOfLinksPerVertex;
+	int maxLinksPerVertex = 0;
+
+	// Count num vertices
+	int numVertices = 0;
+	for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )
+	{
+		btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );
+		numVertices = btMax( numVertices, nodes.vertex0 + 1 );
+		numVertices = btMax( numVertices, nodes.vertex1 + 1 );
+	}
+
+	// Need list of links per vertex
+	// Compute valence of each vertex
+	btAlignedObjectArray <int> numLinksPerVertex;
+	numLinksPerVertex.resize(0);
+	numLinksPerVertex.resize( numVertices, 0 );
+
+	generateLinksPerVertex( numVertices, linkData, listOfLinksPerVertex, numLinksPerVertex, maxLinksPerVertex );
+
+	if (!numVertices)
+		return;
+
+	for( int vertex = 0; vertex < 10; ++vertex )
+	{
+		for( int link = 0; link < numLinksPerVertex[vertex]; ++link )
+		{
+			int linkAddress = vertex * maxLinksPerVertex + link;
+		}
+	}
+
+
+	// At this point we know what links we have for each vertex so we can start batching
+	
+	// We want a vertex to start with, let's go with 0
+	int currentVertex = 0;
+	int linksProcessed = 0;
+
+	btAlignedObjectArray <int> verticesToProcess;
+
+	while( linksProcessed < linkData.getNumLinks() )
+	{
+		// Next wavefront
+		int nextWavefront = linksForWavefronts.size();
+		linksForWavefronts.resize( nextWavefront + 1 );
+		btAlignedObjectArray <int> &linksForWavefront(linksForWavefronts[nextWavefront]);
+		verticesForWavefronts.resize( nextWavefront + 1 );
+		btAlignedObjectArray<int> &vertexSet( verticesForWavefronts[nextWavefront] );
+
+		linksForWavefront.resize(0);
+
+		// Loop to find enough links to fill the wavefront
+		// Stopping if we either run out of links, or fill it
+		while( linksProcessed < linkData.getNumLinks() && linksForWavefront.size() < maxLinksPerWavefront )
+		{
+			// Go through the links for the current vertex
+			for( int link = 0; link < numLinksPerVertex[currentVertex] && linksForWavefront.size() < maxLinksPerWavefront; ++link )
+			{
+				int linkAddress = currentVertex * maxLinksPerVertex + link;
+				int linkIndex = listOfLinksPerVertex[linkAddress];
+				
+				// If we have not already processed this link, add it to the wavefront
+				// Claim it as another processed link
+				// Add the vertex at the far end to the list of vertices to process.
+				if( !processedLink[linkIndex] )
+				{
+					linksForWavefront.push_back( linkIndex );
+					linksProcessed++;
+					processedLink[linkIndex] = true;
+					int v0 = linkData.getVertexPair(linkIndex).vertex0;
+					int v1 = linkData.getVertexPair(linkIndex).vertex1;
+					if( v0 == currentVertex )
+						verticesToProcess.push_back( v1 );
+					else
+						verticesToProcess.push_back( v0 );
+				}
+			}
+			if( verticesToProcess.size() > 0 )
+			{
+				// Get the element on the front of the queue and remove it
+				currentVertex = verticesToProcess[0];
+				removeFromVector( verticesToProcess, 0 );
+			} else {		
+				// If we've not yet processed all the links, find the first unprocessed one
+				// and select one of its vertices as the current vertex
+				if( linksProcessed < linkData.getNumLinks() )
+				{
+					int searchLink = 0;
+					while( processedLink[searchLink] )
+						searchLink++;
+					currentVertex = linkData.getVertexPair(searchLink).vertex0;
+				}	
+			}
+		}
+
+		// We have either finished or filled a wavefront
+		for( int link = 0; link < linksForWavefront.size(); ++link )
+		{
+			int v0 = linkData.getVertexPair( linksForWavefront[link] ).vertex0;
+			int v1 = linkData.getVertexPair( linksForWavefront[link] ).vertex1;
+			insertUniqueAndOrderedIntoVector( vertexSet, v0 );
+			insertUniqueAndOrderedIntoVector( vertexSet, v1 );
+		}
+		// Iterate over links mapped to the wave and batch those
+		// We can run a batch on each cycle trivially
+		
+		batchesWithinWaves.resize( batchesWithinWaves.size() + 1 );
+		btAlignedObjectArray < btAlignedObjectArray <int> > &batchesWithinWave( batchesWithinWaves[batchesWithinWaves.size()-1] );
+		
+
+		for( int link = 0; link < linksForWavefront.size(); ++link )
+		{
+			int linkIndex = linksForWavefront[link];
+			btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( linkIndex );
+			
+			int batch = 0;
+			bool placed = false;
+			while( batch < batchesWithinWave.size() && !placed )
+			{
+				bool foundSharedVertex = false;
+				if( batchesWithinWave[batch].size() >= wavefrontSize )
+				{
+					// If we have already filled this batch, move on to another
+					foundSharedVertex = true;
+				} else {
+					for( int link2 = 0; link2 < batchesWithinWave[batch].size(); ++link2 )
+					{
+						btSoftBodyLinkData::LinkNodePair vertices2 = linkData.getVertexPair( (batchesWithinWave[batch])[link2] );
+
+						if( vertices.vertex0 == vertices2.vertex0 ||
+							vertices.vertex1 == vertices2.vertex0 ||
+							vertices.vertex0 == vertices2.vertex1 ||
+							vertices.vertex1 == vertices2.vertex1 )
+						{
+							foundSharedVertex = true;
+							break;
+						}
+					}
+				}
+				if( !foundSharedVertex )
+				{
+					batchesWithinWave[batch].push_back( linkIndex );
+					placed = true;
+				} else {
+					++batch;
+				}
+			}
+			if( batch == batchesWithinWave.size() && !placed )
+			{
+				batchesWithinWave.resize( batch + 1 );
+				batchesWithinWave[batch].push_back( linkIndex );
+			}
+		}
+		
+	}
+
+}
+
+void btSoftBodyLinkDataOpenCLSIMDAware::generateBatches()
+{
+	btAlignedObjectArray < btAlignedObjectArray <int> > linksForWavefronts;
+	btAlignedObjectArray < btAlignedObjectArray <int> > wavefrontBatches;
+	btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray <int> > > batchesWithinWaves;
+	btAlignedObjectArray< btAlignedObjectArray< int > > verticesForWavefronts; // wavefronts, vertices in wavefront as an ordered set
+
+	// Group the links into wavefronts
+	computeBatchingIntoWavefronts( *this, m_wavefrontSize, m_linksPerWorkItem, m_maxLinksPerWavefront, linksForWavefronts, batchesWithinWaves, verticesForWavefronts );
+
+
+	// Batch the wavefronts
+	generateBatchesOfWavefronts( linksForWavefronts, *this, m_maxVertex, wavefrontBatches );
+
+	m_numWavefronts = linksForWavefronts.size();
+
+	// At this point we have a description of which links we need to process in each wavefront
+
+	// First correctly fill the batch ranges vector
+	int numBatches = wavefrontBatches.size();
+	m_wavefrontBatchStartLengths.resize(0);
+	int prefixSum = 0;
+	for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex )
+	{
+		int wavesInBatch = wavefrontBatches[batchIndex].size();
+		int nextPrefixSum = prefixSum + wavesInBatch;
+		m_wavefrontBatchStartLengths.push_back( BatchPair( prefixSum, nextPrefixSum - prefixSum ) );
+
+		prefixSum += wavesInBatch;
+	}
+	
+	// Also find max number of batches within a wave
+	m_maxBatchesWithinWave = 0;
+	m_maxVerticesWithinWave = 0;
+	m_numBatchesAndVerticesWithinWaves.resize( m_numWavefronts );
+	for( int waveIndex = 0; waveIndex < m_numWavefronts; ++waveIndex )
+	{
+		// See if the number of batches in this wave is greater than the current maxium
+		int batchesInCurrentWave = batchesWithinWaves[waveIndex].size();
+		int verticesInCurrentWave = verticesForWavefronts[waveIndex].size();
+		m_maxBatchesWithinWave = btMax( batchesInCurrentWave, m_maxBatchesWithinWave );
+		m_maxVerticesWithinWave = btMax( verticesInCurrentWave, m_maxVerticesWithinWave );
+	}
+	
+	// Add padding values both for alignment and as dudd addresses within LDS to compute junk rather than branch around
+	m_maxVerticesWithinWave = 16*((m_maxVerticesWithinWave/16)+2);
+
+	// Now we know the maximum number of vertices per-wave we can resize the global vertices array
+	m_wavefrontVerticesGlobalAddresses.resize( m_maxVerticesWithinWave * m_numWavefronts );
+
+	// Grab backup copies of all the link data arrays for the sorting process
+	btAlignedObjectArray<btSoftBodyLinkData::LinkNodePair>				m_links_Backup(m_links);
+	btAlignedObjectArray<float>											m_linkStrength_Backup(m_linkStrength);
+	btAlignedObjectArray<float>											m_linksMassLSC_Backup(m_linksMassLSC);
+	btAlignedObjectArray<float>											m_linksRestLengthSquared_Backup(m_linksRestLengthSquared);
+	//btAlignedObjectArray<Vectormath::Aos::Vector3>						m_linksCLength_Backup(m_linksCLength);
+	//btAlignedObjectArray<float>											m_linksLengthRatio_Backup(m_linksLengthRatio);
+	btAlignedObjectArray<float>											m_linksRestLength_Backup(m_linksRestLength);
+	btAlignedObjectArray<float>											m_linksMaterialLinearStiffnessCoefficient_Backup(m_linksMaterialLinearStiffnessCoefficient);
+
+	// Resize to a wavefront sized batch per batch per wave so we get perfectly coherent memory accesses.
+	m_links.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
+	m_linkVerticesLocalAddresses.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
+	m_linkStrength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
+	m_linksMassLSC.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
+	m_linksRestLengthSquared.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
+	m_linksRestLength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
+	m_linksMaterialLinearStiffnessCoefficient.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );	
+		
+	// Then re-order links into wavefront blocks
+
+	// Total number of wavefronts moved. This will decide the ordering of sorted wavefronts.
+	int wavefrontCount = 0;
+
+	// Iterate over batches of wavefronts, then wavefronts in the batch
+	for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex )
+	{
+		btAlignedObjectArray <int> &batch( wavefrontBatches[batchIndex] );
+		int wavefrontsInBatch = batch.size();
+
+		
+		for( int wavefrontIndex = 0; wavefrontIndex < wavefrontsInBatch; ++wavefrontIndex )
+		{	
+
+			int originalWavefrontIndex = batch[wavefrontIndex];
+			btAlignedObjectArray< int > &wavefrontVertices( verticesForWavefronts[originalWavefrontIndex] );
+			int verticesUsedByWavefront = wavefrontVertices.size();
+
+			// Copy the set of vertices into the correctly structured array for use on the device
+			// Fill the non-vertices with -1s
+			// so we can mask out those reads
+			for( int vertex = 0; vertex < verticesUsedByWavefront; ++vertex )
+			{
+				m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = wavefrontVertices[vertex];
+			}
+			for( int vertex = verticesUsedByWavefront; vertex < m_maxVerticesWithinWave; ++vertex )
+			{
+				m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = -1;
+			}
+
+			// Obtain the set of batches within the current wavefront
+			btAlignedObjectArray < btAlignedObjectArray <int> > &batchesWithinWavefront( batchesWithinWaves[originalWavefrontIndex] );
+			// Set the size of the batches for use in the solver, correctly ordered
+			NumBatchesVerticesPair batchesAndVertices;
+			batchesAndVertices.numBatches = batchesWithinWavefront.size();
+			batchesAndVertices.numVertices = verticesUsedByWavefront;
+			m_numBatchesAndVerticesWithinWaves[wavefrontCount] = batchesAndVertices;
+			
+
+			// Now iterate over batches within the wavefront to structure the links correctly
+			for( int wavefrontBatch = 0; wavefrontBatch < batchesWithinWavefront.size(); ++wavefrontBatch )
+			{
+				btAlignedObjectArray <int> &linksInBatch( batchesWithinWavefront[wavefrontBatch] );
+				int wavefrontBatchSize = linksInBatch.size();
+
+				int batchAddressInTarget = m_maxBatchesWithinWave * m_wavefrontSize * wavefrontCount + m_wavefrontSize * wavefrontBatch;
+
+				for( int linkIndex = 0; linkIndex < wavefrontBatchSize; ++linkIndex )
+				{
+					int originalLinkAddress = linksInBatch[linkIndex];
+					// Reorder simple arrays trivially
+					m_links[batchAddressInTarget + linkIndex] = m_links_Backup[originalLinkAddress];
+					m_linkStrength[batchAddressInTarget + linkIndex] = m_linkStrength_Backup[originalLinkAddress];
+					m_linksMassLSC[batchAddressInTarget + linkIndex] = m_linksMassLSC_Backup[originalLinkAddress];
+					m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = m_linksRestLengthSquared_Backup[originalLinkAddress];
+					m_linksRestLength[batchAddressInTarget + linkIndex] = m_linksRestLength_Backup[originalLinkAddress];
+					m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = m_linksMaterialLinearStiffnessCoefficient_Backup[originalLinkAddress];
+
+					// The local address is more complicated. We need to work out where a given vertex will end up
+					// by searching the set of vertices for this link and using the index as the local address
+					btSoftBodyLinkData::LinkNodePair localPair;
+					btSoftBodyLinkData::LinkNodePair globalPair = m_links[batchAddressInTarget + linkIndex];
+					localPair.vertex0 = wavefrontVertices.findLinearSearch( globalPair.vertex0 );
+					localPair.vertex1 = wavefrontVertices.findLinearSearch( globalPair.vertex1 );
+					m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair;
+				}
+				for( int linkIndex = wavefrontBatchSize; linkIndex < m_wavefrontSize; ++linkIndex )
+				{
+					// Put 0s into these arrays for padding for cleanliness
+					m_links[batchAddressInTarget + linkIndex] = btSoftBodyLinkData::LinkNodePair(0, 0);
+					m_linkStrength[batchAddressInTarget + linkIndex] = 0.f;
+					m_linksMassLSC[batchAddressInTarget + linkIndex] = 0.f;
+					m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = 0.f;
+					m_linksRestLength[batchAddressInTarget + linkIndex] = 0.f;
+					m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = 0.f;
+
+
+					// For local addresses of junk data choose a set of addresses just above the range of valid ones 
+					// and cycling tyhrough % 16 so that we don't have bank conficts between all dud addresses
+					// The valid addresses will do scatter and gather in the valid range, the junk ones should happily work
+					// off the end of that range so we need no control
+					btSoftBodyLinkData::LinkNodePair localPair;
+					localPair.vertex0 = verticesUsedByWavefront + (linkIndex % 16);
+					localPair.vertex1 = verticesUsedByWavefront + (linkIndex % 16);
+					m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair;
+				}
+
+			}
+
+			
+			wavefrontCount++;
+		}
+
+	
+	}
+
+} // void btSoftBodyLinkDataDX11SIMDAware::generateBatches()
+
+
+
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h
new file mode 100644
index 000000000..db4f2ae8b
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h
@@ -0,0 +1,82 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SOFT_BODY_SOLVER_OPENCL_SIMDAWARE_H
+#define BT_SOFT_BODY_SOLVER_OPENCL_SIMDAWARE_H
+
+#include "stddef.h" //for size_t
+#include "vectormath/vmInclude.h"
+
+#include "btSoftBodySolver_OpenCL.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
+#include "btSoftBodySolverLinkData_OpenCLSIMDAware.h"
+#include "btSoftBodySolverVertexData_OpenCL.h"
+#include "btSoftBodySolverTriangleData_OpenCL.h"
+
+
+
+
+
+class btOpenCLSoftBodySolverSIMDAware : public btOpenCLSoftBodySolver
+{
+protected:
+	
+
+	btSoftBodyLinkDataOpenCLSIMDAware m_linkData;
+
+
+	bool m_shadersInitialized;
+
+
+	bool buildShaders();
+
+
+	void updateConstants( float timeStep );
+
+	float computeTriangleArea( 
+		const Vectormath::Aos::Point3 &vertex0,
+		const Vectormath::Aos::Point3 &vertex1,
+		const Vectormath::Aos::Point3 &vertex2 );
+
+
+	//////////////////////////////////////
+	// Kernel dispatches
+	void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
+	
+	void solveCollisionsAndUpdateVelocities( float isolverdt );
+	// End kernel dispatches
+	/////////////////////////////////////
+
+public:
+	btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue,cl_context	ctx);
+
+	virtual ~btOpenCLSoftBodySolverSIMDAware();
+
+	virtual SolverTypes getSolverType() const
+	{
+		return CL_SIMD_SOLVER;
+	}
+
+
+	virtual btSoftBodyLinkData &getLinkData();
+
+
+	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);
+
+	virtual void solveConstraints( float solverdt );
+
+}; // btOpenCLSoftBodySolverSIMDAware
+
+#endif // #ifndef BT_SOFT_BODY_SOLVER_OPENCL_SIMDAWARE_H