added missing GPU cloth simulation files and DX11/OpenCL kernels

Thanks to Cameron Hart for the report, see Issue 486
2011-02-28 05:29:54 +00:00
parent a522cb98d9
commit 8cb14e178e
19 changed files with 3496 additions and 0 deletions
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/ComputeBounds.hlsl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/ComputeBounds.hlsl
@@ -0,0 +1,83 @@
 MSTRINGIFY(
 cbuffer ComputeBoundsCB : register( b0 )
 {
 	int numNodes;
 	int numSoftBodies;
 	int padding1;
 	int padding2;
 };
 // Node indices for each link
 StructuredBuffer<int> g_vertexClothIdentifier : register( t0 );
 StructuredBuffer<float4> g_vertexPositions : register( t1 );
 RWStructuredBuffer<uint4> g_clothMinBounds : register( u0 );
 RWStructuredBuffer<uint4> g_clothMaxBounds : register( u1 );
 groupshared uint4 clothMinBounds[256];
 groupshared uint4 clothMaxBounds[256];
 [numthreads(128, 1, 1)]
 void 
 ComputeBoundsKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
 {
 	const unsigned int UINT_MAX = 0xffffffff;
 	// Init min and max bounds arrays
 	if( GTid.x < numSoftBodies )
 	{
 		clothMinBounds[GTid.x] = uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
 		clothMaxBounds[GTid.x] = uint4(0,0,0,0);
 	}
 	AllMemoryBarrierWithGroupSync();
 	int nodeID = DTid.x;
 	if( nodeID < numNodes )
 	{	
 		int clothIdentifier = g_vertexClothIdentifier[nodeID];
 		if( clothIdentifier >= 0 )
 		{
 			float3 position = g_vertexPositions[nodeID].xyz;
 			// Reinterpret position as uint
 			uint3 positionUInt = uint3(asuint(position.x), asuint(position.y), asuint(position.z));
 			// Invert sign bit of positives and whole of negatives to allow comparison as unsigned ints
 			//positionUInt.x ^= uint((-int(positionUInt.x >> 31) | 0x80000000));
 			//positionUInt.y ^= uint((-int(positionUInt.y >> 31) | 0x80000000));
 			//positionUInt.z ^= uint((-int(positionUInt.z >> 31) | 0x80000000));
 			positionUInt.x ^= (1+~(positionUInt.x >> 31) | 0x80000000);
 			positionUInt.y ^= (1+~(positionUInt.y >> 31) | 0x80000000);		
 			positionUInt.z ^= (1+~(positionUInt.z >> 31) | 0x80000000);
 			// Min/max with the LDS values
 			InterlockedMin(clothMinBounds[clothIdentifier].x, positionUInt.x);
 			InterlockedMin(clothMinBounds[clothIdentifier].y, positionUInt.y);
 			InterlockedMin(clothMinBounds[clothIdentifier].z, positionUInt.z);
 			InterlockedMax(clothMaxBounds[clothIdentifier].x, positionUInt.x);
 			InterlockedMax(clothMaxBounds[clothIdentifier].y, positionUInt.y);
 			InterlockedMax(clothMaxBounds[clothIdentifier].z, positionUInt.z);
 		}
 	}
 	AllMemoryBarrierWithGroupSync();
 	// Use global atomics to update the global versions of the data
 	if( GTid.x < numSoftBodies )
 	{
 		InterlockedMin(g_clothMinBounds[GTid.x].x, clothMinBounds[GTid.x].x);
 		InterlockedMin(g_clothMinBounds[GTid.x].y, clothMinBounds[GTid.x].y);
 		InterlockedMin(g_clothMinBounds[GTid.x].z, clothMinBounds[GTid.x].z);
 		InterlockedMax(g_clothMaxBounds[GTid.x].x, clothMaxBounds[GTid.x].x);		
 		InterlockedMax(g_clothMaxBounds[GTid.x].y, clothMaxBounds[GTid.x].y);
 		InterlockedMax(g_clothMaxBounds[GTid.x].z, clothMaxBounds[GTid.x].z);
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocities.hlsl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocities.hlsl
@@ -0,0 +1,170 @@
 MSTRINGIFY(
 cbuffer SolvePositionsFromLinksKernelCB : register( b0 )
 {
 	unsigned int numNodes;
 	float isolverdt;
 	int padding0;
 	int padding1;
 };
 struct CollisionObjectIndices
 {
 	int firstObject;
 	int endObject;
 };
 struct CollisionShapeDescription
 {
 	float4x4 shapeTransform;
 	float4 linearVelocity;
 	float4 angularVelocity;
 	int softBodyIdentifier;
 	int collisionShapeType;
 	// Shape information
 	// Compressed from the union
 	float radius;
 	float halfHeight;
 	float margin;
 	float friction;
 	int padding0;
 	int padding1;
 };
 // From btBroadphaseProxy.h
 static const int CAPSULE_SHAPE_PROXYTYPE = 10;
 // Node indices for each link
 StructuredBuffer<int> g_vertexClothIdentifier : register( t0 );
 StructuredBuffer<float4> g_vertexPreviousPositions : register( t1 );
 StructuredBuffer<float> g_perClothFriction : register( t2 );
 StructuredBuffer<float> g_clothDampingFactor : register( t3 );
 StructuredBuffer<CollisionObjectIndices> g_perClothCollisionObjectIndices : register( t4 );
 StructuredBuffer<CollisionShapeDescription> g_collisionObjectDetails : register( t5 );
 RWStructuredBuffer<float4> g_vertexForces : register( u0 );
 RWStructuredBuffer<float4> g_vertexVelocities : register( u1 );
 RWStructuredBuffer<float4> g_vertexPositions : register( u2 );
 [numthreads(128, 1, 1)]
 void 
 SolveCollisionsAndUpdateVelocitiesKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
 {
 	int nodeID = DTid.x;
 	float3 forceOnVertex = float3(0.f, 0.f, 0.f);
 	if( DTid.x < numNodes )
 	{	
 		int clothIdentifier = g_vertexClothIdentifier[nodeID];
 		float4 position = float4(g_vertexPositions[nodeID].xyz, 1.f);
 		float4 previousPosition = float4(g_vertexPreviousPositions[nodeID].xyz, 1.f);
 		float3 velocity;
 		float clothFriction = g_perClothFriction[clothIdentifier];
 		float dampingFactor = g_clothDampingFactor[clothIdentifier];
 		float velocityCoefficient = (1.f - dampingFactor);		
 		CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
 		if( collisionObjectIndices.firstObject != collisionObjectIndices.endObject )
 		{
 			velocity = float3(15, 0, 0);
 			// We have some possible collisions to deal with
 			for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision )
 			{
 				CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision];
 				float colliderFriction = shapeDescription.friction;
 				if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
 				{
 					// Colliding with a capsule
 					float capsuleHalfHeight = shapeDescription.halfHeight;
 					float capsuleRadius = shapeDescription.radius;
 					float capsuleMargin = shapeDescription.margin;
 					float4x4 worldTransform = shapeDescription.shapeTransform;
 					float4 c1 = float4(0.f, -capsuleHalfHeight, 0.f, 1.f); 
 					float4 c2 = float4(0.f, +capsuleHalfHeight, 0.f, 1.f);
 					float4 worldC1 = mul(worldTransform, c1);
 					float4 worldC2 = mul(worldTransform, c2);
 					float3 segment = (worldC2 - worldC1).xyz;
 					// compute distance of tangent to vertex along line segment in capsule
 					float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
 					float4 closestPoint = (worldC1 + float4(segment * distanceAlongSegment, 0.f));
 					float distanceFromLine = length(position - closestPoint);
 					float distanceFromC1 = length(worldC1 - position);
 					float distanceFromC2 = length(worldC2 - position);
 					// Final distance from collision, point to push from, direction to push in
 					// for impulse force
 					float dist;
 					float3 normalVector;
 					if( distanceAlongSegment < 0 )
 					{
 						dist = distanceFromC1;
 						normalVector = normalize(position - worldC1).xyz;
 					} else if( distanceAlongSegment > 1.f ) {
 						dist = distanceFromC2;
 						normalVector = normalize(position - worldC2).xyz;	
 					} else {
 						dist = distanceFromLine;
 						normalVector = normalize(position - closestPoint).xyz;
 					}
 					float3 colliderLinearVelocity = shapeDescription.linearVelocity.xyz;
 					float3 colliderAngularVelocity = shapeDescription.angularVelocity.xyz;
 					float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - worldTransform._m03_m13_m23);
 					float minDistance = capsuleRadius + capsuleMargin;
 					// In case of no collision, this is the value of velocity
 					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
 					// Check for a collision
 					if( dist < minDistance )
 					{
 						// Project back to surface along normal
 						position = position + float4((minDistance - dist)*normalVector*0.9, 0.f);
 						velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
 						float3 relativeVelocity = velocity - velocityOfSurfacePoint;
 						float3 p1 = normalize(cross(normalVector, segment));
 						float3 p2 = normalize(cross(p1, normalVector));
 						// Full friction is sum of velocities in each direction of plane
 						float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
 						// Real friction is peak friction corrected by friction coefficients
 						frictionVector = frictionVector * (colliderFriction*clothFriction);
 						float approachSpeed = dot(relativeVelocity, normalVector);
 						if( approachSpeed <= 0.0 )
 							forceOnVertex -= frictionVector;
 					}
 				}
 			}
 		} else {
 			// Update velocity	
 			float3 difference = position.xyz - previousPosition.xyz;
 			velocity = difference*velocityCoefficient*isolverdt;			
 		}
 		g_vertexVelocities[nodeID] = float4(velocity, 0.f);	
 		// Update external force
 		g_vertexForces[nodeID] = float4(forceOnVertex, 0.f);
 		g_vertexPositions[nodeID] = float4(position.xyz, 0.f);
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocitiesSIMDBatched.hlsl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocitiesSIMDBatched.hlsl
@@ -0,0 +1,191 @@
 MSTRINGIFY(
 cbuffer SolvePositionsFromLinksKernelCB : register( b0 )
 {
 	unsigned int numNodes;
 	float isolverdt;
 	int padding0;
 	int padding1;
 };
 struct CollisionObjectIndices
 {
 	int firstObject;
 	int endObject;
 };
 struct CollisionShapeDescription
 {
 	float4x4 shapeTransform;
 	float4 linearVelocity;
 	float4 angularVelocity;
 	int softBodyIdentifier;
 	int collisionShapeType;
 	// Shape information
 	// Compressed from the union
 	float radius;
 	float halfHeight;
 	float margin;
 	float friction;
 	int padding0;
 	int padding1;
 };
 // From btBroadphaseProxy.h
 static const int CAPSULE_SHAPE_PROXYTYPE = 10;
 // Node indices for each link
 StructuredBuffer<int> g_vertexClothIdentifier : register( t0 );
 StructuredBuffer<float4> g_vertexPreviousPositions : register( t1 );
 StructuredBuffer<float> g_perClothFriction : register( t2 );
 StructuredBuffer<float> g_clothDampingFactor : register( t3 );
 StructuredBuffer<CollisionObjectIndices> g_perClothCollisionObjectIndices : register( t4 );
 StructuredBuffer<CollisionShapeDescription> g_collisionObjectDetails : register( t5 );
 RWStructuredBuffer<float4> g_vertexForces : register( u0 );
 RWStructuredBuffer<float4> g_vertexVelocities : register( u1 );
 RWStructuredBuffer<float4> g_vertexPositions : register( u2 );
 // A buffer of local collision shapes
 // TODO: Iterate to support more than 16
 groupshared CollisionShapeDescription localCollisionShapes[16];
 [numthreads(128, 1, 1)]
 void 
 SolveCollisionsAndUpdateVelocitiesKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
 {
 	int nodeID = DTid.x;
 	float3 forceOnVertex = float3(0.f, 0.f, 0.f);
 	int clothIdentifier = g_vertexClothIdentifier[nodeID];
 	float4 position = float4(g_vertexPositions[nodeID].xyz, 1.f);
 	float4 previousPosition = float4(g_vertexPreviousPositions[nodeID].xyz, 1.f);
 	float3 velocity;
 	float clothFriction = g_perClothFriction[clothIdentifier];
 	float dampingFactor = g_clothDampingFactor[clothIdentifier];
 	float velocityCoefficient = (1.f - dampingFactor);		
 	CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
 	int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
 	if( numObjects > 0 )
 	{
 		// We have some possible collisions to deal with
 		// First load all of the collision objects into LDS
 		int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
 		if( GTid.x < numObjects )
 		{
 			localCollisionShapes[GTid.x] = g_collisionObjectDetails[ collisionObjectIndices.firstObject + GTid.x ];
 		}
 	}
 	// Safe as the vertices are padded so that not more than one soft body is in a group
 	AllMemoryBarrierWithGroupSync();
 	// Annoyingly, even though I know the flow control is not varying, the compiler will not let me skip this
 	if( numObjects > 0 )
 	{
 		velocity = float3(0, 0, 0);
 		// We have some possible collisions to deal with
 		for( int collision = 0; collision < numObjects; ++collision )
 		{
 			CollisionShapeDescription shapeDescription = localCollisionShapes[collision];
 			float colliderFriction = shapeDescription.friction;
 			if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
 			{
 				// Colliding with a capsule
 				float capsuleHalfHeight = localCollisionShapes[collision].halfHeight;
 				float capsuleRadius = localCollisionShapes[collision].radius;
 				float capsuleMargin = localCollisionShapes[collision].margin;
 				float4x4 worldTransform = localCollisionShapes[collision].shapeTransform;
 				float4 c1 = float4(0.f, -capsuleHalfHeight, 0.f, 1.f); 
 				float4 c2 = float4(0.f, +capsuleHalfHeight, 0.f, 1.f);
 				float4 worldC1 = mul(worldTransform, c1);
 				float4 worldC2 = mul(worldTransform, c2);
 				float3 segment = (worldC2 - worldC1).xyz;
 				// compute distance of tangent to vertex along line segment in capsule
 				float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
 				float4 closestPoint = (worldC1 + float4(segment * distanceAlongSegment, 0.f));
 				float distanceFromLine = length(position - closestPoint);
 				float distanceFromC1 = length(worldC1 - position);
 				float distanceFromC2 = length(worldC2 - position);
 				// Final distance from collision, point to push from, direction to push in
 				// for impulse force
 				float dist;
 				float3 normalVector;
 				if( distanceAlongSegment < 0 )
 				{
 					dist = distanceFromC1;
 					normalVector = normalize(position - worldC1).xyz;
 				} else if( distanceAlongSegment > 1.f ) {
 					dist = distanceFromC2;
 					normalVector = normalize(position - worldC2).xyz;	
 				} else {
 					dist = distanceFromLine;
 					normalVector = normalize(position - closestPoint).xyz;
 				}
 				float3 colliderLinearVelocity = localCollisionShapes[collision].linearVelocity.xyz;
 				float3 colliderAngularVelocity = localCollisionShapes[collision].angularVelocity.xyz;
 				float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - worldTransform._m03_m13_m23);
 				float minDistance = capsuleRadius + capsuleMargin;
 				// In case of no collision, this is the value of velocity
 				velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
 				// Check for a collision
 				if( dist < minDistance )
 				{
 					// Project back to surface along normal
 					position = position + float4((minDistance - dist)*normalVector*0.9, 0.f);
 					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
 					float3 relativeVelocity = velocity - velocityOfSurfacePoint;
 					float3 p1 = normalize(cross(normalVector, segment));
 					float3 p2 = normalize(cross(p1, normalVector));
 					// Full friction is sum of velocities in each direction of plane
 					float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
 					// Real friction is peak friction corrected by friction coefficients
 					frictionVector = frictionVector * (colliderFriction*clothFriction);
 					float approachSpeed = dot(relativeVelocity, normalVector);
 					if( approachSpeed <= 0.0 )
 						forceOnVertex -= frictionVector;
 				}
 			}
 		}
 	} else {
 		// Update velocity	
 		float3 difference = position.xyz - previousPosition.xyz;
 		velocity = difference*velocityCoefficient*isolverdt;			
 	}
 	g_vertexVelocities[nodeID] = float4(velocity, 0.f);	
 	// Update external force
 	g_vertexForces[nodeID] = float4(forceOnVertex, 0.f);
 	g_vertexPositions[nodeID] = float4(position.xyz, 0.f);
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ComputeBounds.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ComputeBounds.cl
@@ -0,0 +1,80 @@
 MSTRINGIFY(
 #pragma OPENCL EXTENSION cl_amd_printf : enable \n
 __kernel void
 ComputeBoundsKernel( 
 	int numNodes,
 	int numSoftBodies,
 	__global int * g_vertexClothIdentifier,
 	__global float4 * g_vertexPositions,
 	volatile __global uint * g_clothMinBounds,
 	volatile __global uint * g_clothMaxBounds,
 	volatile __local uint * clothMinBounds,
 	volatile __local uint * clothMaxBounds)
 {
 	// Init min and max bounds arrays
 	if( get_local_id(0) < numSoftBodies )
 	{
 		clothMinBounds[get_local_id(0)*4] = UINT_MAX;
 		clothMinBounds[get_local_id(0)*4+1] = UINT_MAX;
 		clothMinBounds[get_local_id(0)*4+2] = UINT_MAX;
 		clothMinBounds[get_local_id(0)*4+3] = UINT_MAX;
 		clothMaxBounds[get_local_id(0)*4] = 0;
 		clothMaxBounds[get_local_id(0)*4+1] = 0;
 		clothMaxBounds[get_local_id(0)*4+2] = 0;
 		clothMaxBounds[get_local_id(0)*4+3] = 0;
 	}
 	barrier(CLK_GLOBAL_MEM_FENCE);
 	int nodeID = get_global_id(0);
 	if( nodeID < numNodes )
 	{	
 		int clothIdentifier = g_vertexClothIdentifier[get_global_id(0)];
 		if( clothIdentifier >= 0 )
 		{
 			float3 position = g_vertexPositions[get_global_id(0)].xyz;
 			/* Reinterpret position as uint */
 			uint3 positionUInt = (uint3)(as_uint(position.x), as_uint(position.y), as_uint(position.z));
 			/* Invert sign bit of positives and whole of negatives to allow comparison as unsigned ints */
 			positionUInt.x ^= (1+~(positionUInt.x >> 31) | 0x80000000);
 			positionUInt.y ^= (1+~(positionUInt.y >> 31) | 0x80000000);		
 			positionUInt.z ^= (1+~(positionUInt.z >> 31) | 0x80000000);
 			/* Min/max with the LDS values */
 			atomic_min(&(clothMinBounds[clothIdentifier*4]), positionUInt.x);
 			atomic_min(&(clothMinBounds[clothIdentifier*4+1]), positionUInt.y);
 			atomic_min(&(clothMinBounds[clothIdentifier*4+2]), positionUInt.z);
 			atomic_max(&(clothMaxBounds[clothIdentifier*4]), positionUInt.x);
 			atomic_max(&(clothMaxBounds[clothIdentifier*4+1]), positionUInt.y);
 			atomic_max(&(clothMaxBounds[clothIdentifier*4+2]), positionUInt.z);
 		}
 	}
 	barrier(CLK_GLOBAL_MEM_FENCE);
 	/* Use global atomics to update the global versions of the data*/
 	if( get_local_id(0) < numSoftBodies )
 	{
 		/*atomic_min(&(g_clothMinBounds[get_local_id(0)].x), clothMinBounds[get_local_id(0)].x);*/
 		atomic_min(&(g_clothMinBounds[get_local_id(0)*4]), clothMinBounds[get_local_id(0)*4]);
 		atomic_min(&(g_clothMinBounds[get_local_id(0)*4+1]), clothMinBounds[get_local_id(0)*4+1]);
 		atomic_min(&(g_clothMinBounds[get_local_id(0)*4+2]), clothMinBounds[get_local_id(0)*4+2]);
 		atomic_max(&(g_clothMaxBounds[get_local_id(0)*4]), clothMaxBounds[get_local_id(0)*4]);		
 		atomic_max(&(g_clothMaxBounds[get_local_id(0)*4+1]), clothMaxBounds[get_local_id(0)*4+1]);
 		atomic_max(&(g_clothMaxBounds[get_local_id(0)*4+2]), clothMaxBounds[get_local_id(0)*4+2]);
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/OutputToVertexArray.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/OutputToVertexArray.cl
@@ -0,0 +1,57 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 cbuffer OutputToVertexArrayCB : register( b0 )
 {
 	int startNode;
 	int numNodes;
 	int offsetX;
 	int strideX;
 	int offsetN;	
 	int strideN;
 	int padding1;
 	int padding2;
 };
 StructuredBuffer<float4> g_nodesx : register( t0 );
 StructuredBuffer<float4> g_nodesn : register( t1 );
 RWStructuredBuffer<float> g_vertexBuffer : register( u0 );
 [numthreads(128, 1, 1)]
 void 
 OutputToVertexArrayKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
 {
 	int nodeID = DTid.x;
 	if( nodeID < numNodes )
 	{			
 		float4 nodeX = g_nodesx[nodeID + startNode];
 		float4 nodeN = g_nodesn[nodeID + startNode];
 		// Stride should account for the float->float4 conversion
 		int positionDestination = nodeID * strideX + offsetX;		
 		g_vertexBuffer[positionDestination] = nodeX.x;
 		g_vertexBuffer[positionDestination+1] = nodeX.y;
 		g_vertexBuffer[positionDestination+2] = nodeX.z;
 		int normalDestination = nodeID * strideN + offsetN;
 		g_vertexBuffer[normalDestination] = nodeN.x;
 		g_vertexBuffer[normalDestination+1] = nodeN.y;
 		g_vertexBuffer[normalDestination+2] = nodeN.z;		
 	}
 }
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositionsSIMDBatched.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositionsSIMDBatched.cl
@@ -0,0 +1,139 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 MSTRINGIFY(
 float mydot3(float4 a, float4 b)
 {
   return a.x*b.x + a.y*b.y + a.z*b.z;
 }
 __kernel void 
 SolvePositionsFromLinksKernel( 
 	const int startWaveInBatch,
 	const int numWaves,
 	const float kst,
 	const float ti,
 	__global int2 *g_wavefrontBatchCountsVertexCounts,
 	__global int *g_vertexAddressesPerWavefront,
 	__global int2 * g_linksVertexIndices,
 	__global float * g_linksMassLSC,
 	__global float * g_linksRestLengthSquared,
 	__global float * g_verticesInverseMass,
 	__global float4 * g_vertexPositions,
 	__local int2 *wavefrontBatchCountsVertexCounts,
 	__local float4 *vertexPositionSharedData,
 	__local float *vertexInverseMassSharedData)
 {
 	const int laneInWavefront = (get_global_id(0) & (WAVEFRONT_SIZE-1));
 	const int wavefront = startWaveInBatch + (get_global_id(0) / WAVEFRONT_SIZE);
 	const int firstWavefrontInBlock = startWaveInBatch + get_group_id(0) * WAVEFRONT_BLOCK_MULTIPLIER;
 	const int localWavefront = wavefront - firstWavefrontInBlock;
 	// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier	
 	if( wavefront < (startWaveInBatch + numWaves) )
 	{	
 		// Load the batch counts for the wavefronts
 		// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier
 		if( laneInWavefront == 0 )
 		{
 			int2 batchesAndVertexCountsWithinWavefront = g_wavefrontBatchCountsVertexCounts[wavefront];
 			wavefrontBatchCountsVertexCounts[localWavefront] = batchesAndVertexCountsWithinWavefront;
 		}
 		mem_fence(CLK_LOCAL_MEM_FENCE);
 		int2 batchesAndVerticesWithinWavefront = wavefrontBatchCountsVertexCounts[localWavefront];
 		int batchesWithinWavefront = batchesAndVerticesWithinWavefront.x;
 		int verticesUsedByWave = batchesAndVerticesWithinWavefront.y;
 		// Load the vertices for the wavefronts
 		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
 		{
 			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
 			vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_vertexPositions[vertexAddress];
 			vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress];
 		}
 		mem_fence(CLK_LOCAL_MEM_FENCE);
 		// Loop through the batches performing the solve on each in LDS
 		int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE;	
 		//for( int batch = 0; batch < batchesWithinWavefront; ++batch )
 		int batch = 0;
 		do
 		{
 			int baseDataLocation = baseDataLocationForWave + WAVEFRONT_SIZE * batch;
 			int locationOfValue = baseDataLocation + laneInWavefront;
 			// These loads should all be perfectly linear across the WF
 			int2 localVertexIndices = g_linksVertexIndices[locationOfValue];
 			float massLSC = g_linksMassLSC[locationOfValue];
 			float restLengthSquared = g_linksRestLengthSquared[locationOfValue];
 			// LDS vertex addresses based on logical wavefront number in block and loaded index
 			int vertexAddress0 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.x;
 			int vertexAddress1 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.y;
 			float4 position0 = vertexPositionSharedData[vertexAddress0];
 			float4 position1 = vertexPositionSharedData[vertexAddress1];
 			float inverseMass0 = vertexInverseMassSharedData[vertexAddress0];
 			float inverseMass1 = vertexInverseMassSharedData[vertexAddress1]; 
 			float4 del = position1 - position0;
 			float len = mydot3(del, del);
 			float k = 0;
 			if( massLSC > 0.0f )
 			{		
 				k = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
 			}
 			position0 = position0 - del*(k*inverseMass0);
 			position1 = position1 + del*(k*inverseMass1);
 			// Ensure compiler does not re-order memory operations
 			mem_fence(CLK_LOCAL_MEM_FENCE);
 			vertexPositionSharedData[vertexAddress0] = position0;
 			vertexPositionSharedData[vertexAddress1] = position1;
 			// Ensure compiler does not re-order memory operations
 			mem_fence(CLK_LOCAL_MEM_FENCE);
 			++batch;
 		} while( batch < batchesWithinWavefront );
 		// Update the global memory vertices for the wavefronts
 		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
 		{
 			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
 			g_vertexPositions[vertexAddress] = (float4)(vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex].xyz, 0.f);
 		}		
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocities.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocities.cl
@@ -0,0 +1,195 @@
 MSTRINGIFY(
 typedef struct 
 {
 	int firstObject;
 	int endObject;
 } CollisionObjectIndices;
 typedef struct 
 {
 	float4 shapeTransform[4]; // column major 4x4 matrix
 	float4 linearVelocity;
 	float4 angularVelocity;
 	int softBodyIdentifier;
 	int collisionShapeType;
 	// Shape information
 	// Compressed from the union
 	float radius;
 	float halfHeight;
 	int upAxis;
 	float margin;
 	float friction;
 	int padding0;
 } CollisionShapeDescription;
 /* From btBroadphaseProxy.h */
 __constant int CAPSULE_SHAPE_PROXYTYPE = 10;
 /* Multiply column-major matrix against vector */
 float4 matrixVectorMul( float4 matrix[4], float4 vector )
 {
 	float4 returnVector;
 	float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
 	float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
 	float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
 	float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
 	returnVector.x = dot(row0, vector);
 	returnVector.y = dot(row1, vector);
 	returnVector.z = dot(row2, vector);
 	returnVector.w = dot(row3, vector);
 	return returnVector;
 }
 __kernel void 
 SolveCollisionsAndUpdateVelocitiesKernel( 
 	const int numNodes,
 	const float isolverdt,
 	__global int *g_vertexClothIdentifier,
 	__global float4 *g_vertexPreviousPositions,
 	__global float * g_perClothFriction,
 	__global float * g_clothDampingFactor,
 	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
 	__global CollisionShapeDescription * g_collisionObjectDetails,
 	__global float4 * g_vertexForces,
 	__global float4 *g_vertexVelocities,
 	__global float4 *g_vertexPositions)
 {
 	int nodeID = get_global_id(0);
 	float3 forceOnVertex = (float3)(0.f, 0.f, 0.f);
 	if( get_global_id(0) < numNodes )
 	{	
 		int clothIdentifier = g_vertexClothIdentifier[nodeID];
 		// Abort if this is not a valid cloth
 		if( clothIdentifier < 0 )
 			return;
 		float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f);
 		float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f);
 		float3 velocity;
 		float clothFriction = g_perClothFriction[clothIdentifier];
 		float dampingFactor = g_clothDampingFactor[clothIdentifier];
 		float velocityCoefficient = (1.f - dampingFactor);		
 		CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
 		if( collisionObjectIndices.firstObject != collisionObjectIndices.endObject )
 		{
 			velocity = (float3)(15, 0, 0);
 			/* We have some possible collisions to deal with */
 			for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision )
 			{
 				CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision];
 				float colliderFriction = shapeDescription.friction;
 				if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
 				{
 					/* Colliding with a capsule */
 					float capsuleHalfHeight = shapeDescription.halfHeight;
 					float capsuleRadius = shapeDescription.radius;
 					float capsuleMargin = shapeDescription.margin;
 					int capsuleupAxis = shapeDescription.upAxis;
 					/* Four columns of worldTransform matrix */
 					float4 worldTransform[4];
 					worldTransform[0] = shapeDescription.shapeTransform[0];
 					worldTransform[1] = shapeDescription.shapeTransform[1];
 					worldTransform[2] = shapeDescription.shapeTransform[2];
 					worldTransform[3] = shapeDescription.shapeTransform[3];
 					// Correctly define capsule centerline vector 
 					float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); 
 					float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
 					c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
 					c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
 					c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
 					c2.x = -c1.x;
 					c2.y = -c1.y;
 					c2.z = -c1.z;
 					float4 worldC1 = matrixVectorMul(worldTransform, c1);
 					float4 worldC2 = matrixVectorMul(worldTransform, c2);
 					float3 segment = (worldC2 - worldC1).xyz;
 					/* compute distance of tangent to vertex along line segment in capsule */
 					float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
 					float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment, 0.f));
 					float distanceFromLine = length(position - closestPoint);
 					float distanceFromC1 = length(worldC1 - position);
 					float distanceFromC2 = length(worldC2 - position);
 					/* Final distance from collision, point to push from, direction to push in
 					   for impulse force */
 					float dist;
 					float3 normalVector;
 					if( distanceAlongSegment < 0 )
 					{
 						dist = distanceFromC1;
 						normalVector = normalize(position - worldC1).xyz;
 					} else if( distanceAlongSegment > 1.f ) {
 						dist = distanceFromC2;
 						normalVector = normalize(position - worldC2).xyz;	
 					} else {
 						dist = distanceFromLine;
 						normalVector = normalize(position - closestPoint).xyz;
 					}
 					float3 colliderLinearVelocity = shapeDescription.linearVelocity.xyz;
 					float3 colliderAngularVelocity = shapeDescription.angularVelocity.xyz;
 					float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - (float3)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w));
 					float minDistance = capsuleRadius + capsuleMargin;
 					/* In case of no collision, this is the value of velocity */
 					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
 					// Check for a collision
 					if( dist < minDistance )
 					{
 						/* Project back to surface along normal */
 						position = position + (float4)((minDistance - dist)*normalVector*0.9f, 0.f);
 						velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
 						float3 relativeVelocity = velocity - velocityOfSurfacePoint;
 						float3 p1 = normalize(cross(normalVector, segment));
 						float3 p2 = normalize(cross(p1, normalVector));
 						/* Full friction is sum of velocities in each direction of plane */
 						float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
 						/* Real friction is peak friction corrected by friction coefficients */
 						frictionVector = frictionVector * (colliderFriction*clothFriction);
 						float approachSpeed = dot(relativeVelocity, normalVector);
 						if( approachSpeed <= 0.0f )
 							forceOnVertex -= frictionVector;
 					}
 				}
 			}
 		} else {
 			/* Update velocity	 */
 			float3 difference = position.xyz - previousPosition.xyz;
 			velocity = difference*velocityCoefficient*isolverdt;			
 		}
 		g_vertexVelocities[nodeID] = (float4)(velocity, 0.f);	
 		/* Update external force */
 		g_vertexForces[nodeID] = (float4)(forceOnVertex, 0.f);
 		g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl
@@ -0,0 +1,213 @@
 MSTRINGIFY(
 typedef struct 
 {
 	int firstObject;
 	int endObject;
 } CollisionObjectIndices;
 typedef struct 
 {
 	float4 shapeTransform[4]; /* column major 4x4 matrix */
 	float4 linearVelocity;
 	float4 angularVelocity;
 	int softBodyIdentifier;
 	int collisionShapeType;
 	// Shape information
 	// Compressed from the union
 	float radius;
 	float halfHeight;
 	int upAxis;
 	float margin;
 	float friction;
 	int padding0;
 } CollisionShapeDescription;
 /* From btBroadphaseProxy.h */
 __constant int CAPSULE_SHAPE_PROXYTYPE = 10;
 /* Multiply column-major matrix against vector */
 float4 matrixVectorMul( float4 matrix[4], float4 vector )
 {
 	float4 returnVector;
 	float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
 	float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
 	float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
 	float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
 	returnVector.x = dot(row0, vector);
 	returnVector.y = dot(row1, vector);
 	returnVector.z = dot(row2, vector);
 	returnVector.w = dot(row3, vector);
 	return returnVector;
 }
 __kernel void 
 SolveCollisionsAndUpdateVelocitiesKernel( 
 	const int numNodes,
 	const float isolverdt,
 	__global int *g_vertexClothIdentifier,
 	__global float4 *g_vertexPreviousPositions,
 	__global float * g_perClothFriction,
 	__global float * g_clothDampingFactor,
 	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
 	__global CollisionShapeDescription * g_collisionObjectDetails,
 	__global float4 * g_vertexForces,
 	__global float4 *g_vertexVelocities,
 	__global float4 *g_vertexPositions,
 	__local CollisionShapeDescription *localCollisionShapes)
 {
 	int nodeID = get_global_id(0);
 	float3 forceOnVertex = (float3)(0.f, 0.f, 0.f);
 	int clothIdentifier = g_vertexClothIdentifier[nodeID];
 	// Abort if this is not a valid cloth
 	if( clothIdentifier < 0 )
 		return;
 	float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f);
 	float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f);
 	float3 velocity;
 	float clothFriction = g_perClothFriction[clothIdentifier];
 	float dampingFactor = g_clothDampingFactor[clothIdentifier];
 	float velocityCoefficient = (1.f - dampingFactor);		
 	CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
 	int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
 	if( numObjects > 0 )
 	{
 		/* We have some possible collisions to deal with */
 		/* First load all of the collision objects into LDS */
 		int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
 		if( get_local_id(0) < numObjects )
 		{
 			localCollisionShapes[get_local_id(0)] = g_collisionObjectDetails[ collisionObjectIndices.firstObject + get_local_id(0) ];
 		}
 	}
 	/* Safe as the vertices are padded so that not more than one soft body is in a group */
 	barrier(CLK_LOCAL_MEM_FENCE);
 	/* Annoyingly, even though I know the flow control is not varying, the compiler will not let me skip this */
 	if( numObjects > 0 )
 	{
 		velocity = (float3)(0, 0, 0);
 		// We have some possible collisions to deal with
 		for( int collision = 0; collision < numObjects; ++collision )
 		{
 			CollisionShapeDescription shapeDescription = localCollisionShapes[collision];
 			float colliderFriction = shapeDescription.friction;
 			if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
 			{
 				/* Colliding with a capsule */
 				float capsuleHalfHeight = localCollisionShapes[collision].halfHeight;
 				float capsuleRadius = localCollisionShapes[collision].radius;
 				float capsuleMargin = localCollisionShapes[collision].margin;
 				int capsuleupAxis = localCollisionShapes[collision].upAxis;
 				float4 worldTransform[4];
 				worldTransform[0] = localCollisionShapes[collision].shapeTransform[0];
 				worldTransform[1] = localCollisionShapes[collision].shapeTransform[1];
 				worldTransform[2] = localCollisionShapes[collision].shapeTransform[2];
 				worldTransform[3] = localCollisionShapes[collision].shapeTransform[3];
 				// Correctly define capsule centerline vector 
 				float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); 
 				float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
 				c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
 				c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
 				c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
 				c2.x = -c1.x;
 				c2.y = -c1.y;
 				c2.z = -c1.z;
 				float4 worldC1 = matrixVectorMul(worldTransform, c1);
 				float4 worldC2 = matrixVectorMul(worldTransform, c2);
 				float3 segment = (worldC2 - worldC1).xyz;
 				/* compute distance of tangent to vertex along line segment in capsule */
 				float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
 				float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment, 0.f));
 				float distanceFromLine = length(position - closestPoint);
 				float distanceFromC1 = length(worldC1 - position);
 				float distanceFromC2 = length(worldC2 - position);
 				/* Final distance from collision, point to push from, direction to push in
 				   for impulse force */
 				float dist;
 				float3 normalVector;
 				if( distanceAlongSegment < 0 )
 				{
 					dist = distanceFromC1;
 					normalVector = normalize(position - worldC1).xyz;
 				} else if( distanceAlongSegment > 1.f ) {
 					dist = distanceFromC2;
 					normalVector = normalize(position - worldC2).xyz;	
 				} else {
 					dist = distanceFromLine;
 					normalVector = normalize(position - closestPoint).xyz;
 				}
 				float3 colliderLinearVelocity = localCollisionShapes[collision].linearVelocity.xyz;
 				float3 colliderAngularVelocity = localCollisionShapes[collision].angularVelocity.xyz;
 				float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz -  (float3)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w));
 				float minDistance = capsuleRadius + capsuleMargin;
 				/* In case of no collision, this is the value of velocity */
 				velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
 				/* Check for a collision */
 				if( dist < minDistance )
 				{
 					/* Project back to surface along normal */
 					position = position + (float4)((minDistance - dist)*normalVector*0.9f, 0.f);
 					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
 					float3 relativeVelocity = velocity - velocityOfSurfacePoint;
 					float3 p1 = normalize(cross(normalVector, segment));
 					float3 p2 = normalize(cross(p1, normalVector));
 					/* Full friction is sum of velocities in each direction of plane */
 					float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
 					/* Real friction is peak friction corrected by friction coefficients */
 					frictionVector = frictionVector * (colliderFriction*clothFriction);
 					float approachSpeed = dot(relativeVelocity, normalVector);
 					if( approachSpeed <= 0.0f )
 						forceOnVertex -= frictionVector;
 				}
 			}
 		}
 	} else {
 		/* Update velocity	*/
 		float3 difference = position.xyz - previousPosition.xyz;
 		velocity = difference*velocityCoefficient*isolverdt;			
 	}
 	g_vertexVelocities[nodeID] = (float4)(velocity, 0.f);	
 	/* Update external force */
 	g_vertexForces[nodeID] = (float4)(forceOnVertex, 0.f);
 	g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ComputeBounds.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ComputeBounds.cl
@@ -0,0 +1,82 @@
 MSTRINGIFY(
 #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n
 #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n
 __kernel void
 ComputeBoundsKernel( 
 	const int numNodes,
 	const int numSoftBodies,
 	__global int * g_vertexClothIdentifier,
 	__global float4 * g_vertexPositions,
 	/* Unfortunately, to get the atomics below to work these arrays cannot be */
 	/* uint4, though that is the layout of the data */
 	/* Therefore this is little-endian-only code */
 	volatile __global uint * g_clothMinBounds,
 	volatile __global uint * g_clothMaxBounds,
 	volatile __local uint * clothMinBounds,
 	volatile __local uint * clothMaxBounds)
 {
 	// Init min and max bounds arrays
 	if( get_local_id(0) < numSoftBodies )
 	{
 		clothMinBounds[get_local_id(0)*4] = UINT_MAX;
 		clothMinBounds[get_local_id(0)*4+1] = UINT_MAX;
 		clothMinBounds[get_local_id(0)*4+2] = UINT_MAX;
 		clothMinBounds[get_local_id(0)*4+3] = UINT_MAX;
 		clothMaxBounds[get_local_id(0)*4] = 0;
 		clothMaxBounds[get_local_id(0)*4+1] = 0;
 		clothMaxBounds[get_local_id(0)*4+2] = 0;
 		clothMaxBounds[get_local_id(0)*4+3] = 0;
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	int nodeID = get_global_id(0);
 	if( nodeID < numNodes )
 	{	
 		int clothIdentifier = g_vertexClothIdentifier[nodeID];
 		if( clothIdentifier >= 0 )
 		{
 			float4 position = (float4)(g_vertexPositions[nodeID].xyz, 0.f);
 			/* Reinterpret position as uint */
 			uint4 positionUInt = (uint4)(as_uint(position.x), as_uint(position.y), as_uint(position.z), 0);
 			/* Invert sign bit of positives and whole of negatives to allow comparison as unsigned ints */
 			positionUInt.x ^= (1+~(positionUInt.x >> 31) | 0x80000000);
 			positionUInt.y ^= (1+~(positionUInt.y >> 31) | 0x80000000);		
 			positionUInt.z ^= (1+~(positionUInt.z >> 31) | 0x80000000);
 			// Min/max with the LDS values
 			atom_min(&(clothMinBounds[clothIdentifier*4]), positionUInt.x);
 			atom_min(&(clothMinBounds[clothIdentifier*4+1]), positionUInt.y);
 			atom_min(&(clothMinBounds[clothIdentifier*4+2]), positionUInt.z);
 			atom_max(&(clothMaxBounds[clothIdentifier*4]), positionUInt.x);
 			atom_max(&(clothMaxBounds[clothIdentifier*4+1]), positionUInt.y);
 			atom_max(&(clothMaxBounds[clothIdentifier*4+2]), positionUInt.z);
 		}
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	/* Use global atomics to update the global versions of the data */
 	if( get_local_id(0) < numSoftBodies )
 	{
 		/*atom_min(&(g_clothMinBounds[get_local_id(0)].x), clothMinBounds[get_local_id(0)].x);*/
 		atom_min(&(g_clothMinBounds[get_local_id(0)*4]), clothMinBounds[get_local_id(0)*4]);
 		atom_min(&(g_clothMinBounds[get_local_id(0)*4+1]), clothMinBounds[get_local_id(0)*4+1]);
 		atom_min(&(g_clothMinBounds[get_local_id(0)*4+2]), clothMinBounds[get_local_id(0)*4+2]);
 		atom_max(&(g_clothMaxBounds[get_local_id(0)*4]), clothMaxBounds[get_local_id(0)*4]);		
 		atom_max(&(g_clothMaxBounds[get_local_id(0)*4+1]), clothMaxBounds[get_local_id(0)*4+1]);
 		atom_max(&(g_clothMaxBounds[get_local_id(0)*4+2]), clothMaxBounds[get_local_id(0)*4+2]);
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/OutputToVertexArray.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/OutputToVertexArray.cl
@@ -0,0 +1,46 @@
 MSTRINGIFY(
 __kernel void 
 OutputToVertexArrayWithNormalsKernel( 
 	const int startNode, const int numNodes, __global float *g_vertexBuffer,
 	const int positionOffset, const int positionStride, const __global float4* g_vertexPositions, 
 	const int normalOffset, const int normalStride, const __global float4* g_vertexNormals  )
 {
 	int nodeID = get_global_id(0);
 	if( nodeID < numNodes )
 	{			
 		float4 position = g_vertexPositions[nodeID + startNode];
 		float4 normal = g_vertexNormals[nodeID + startNode];
 		// Stride should account for the float->float4 conversion
 		int positionDestination = nodeID * positionStride + positionOffset;		
 		g_vertexBuffer[positionDestination] = position.x;
 		g_vertexBuffer[positionDestination+1] = position.y;
 		g_vertexBuffer[positionDestination+2] = position.z;
 		int normalDestination = nodeID * normalStride + normalOffset;
 		g_vertexBuffer[normalDestination] = normal.x;
 		g_vertexBuffer[normalDestination+1] = normal.y;
 		g_vertexBuffer[normalDestination+2] = normal.z;		
 	}
 }
 __kernel void 
 OutputToVertexArrayWithoutNormalsKernel(
 	const int startNode, const int numNodes, __global float *g_vertexBuffer,
 	const int positionOffset, const int positionStride, const __global float4* g_vertexPositions )
 {
 	int nodeID = get_global_id(0);
 	if( nodeID < numNodes )
 	{			
 		float4 position = g_vertexPositions[nodeID + startNode];
 		// Stride should account for the float->float4 conversion
 		int positionDestination = nodeID * positionStride + positionOffset;		
 		g_vertexBuffer[positionDestination] = position.x;
 		g_vertexBuffer[positionDestination+1] = position.y;
 		g_vertexBuffer[positionDestination+2] = position.z;		
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositionsSIMDBatched.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositionsSIMDBatched.cl
@@ -0,0 +1,140 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 MSTRINGIFY(
 float mydot3(float4 a, float4 b)
 {
   return a.x*b.x + a.y*b.y + a.z*b.z;
 }
 __kernel __attribute__((reqd_work_group_size(WAVEFRONT_BLOCK_MULTIPLIER*WAVEFRONT_SIZE, 1, 1)))
 void 
 SolvePositionsFromLinksKernel( 
 	const int startWaveInBatch,
 	const int numWaves,
 	const float kst,
 	const float ti,
 	__global int2 *g_wavefrontBatchCountsVertexCounts,
 	__global int *g_vertexAddressesPerWavefront,
 	__global int2 * g_linksVertexIndices,
 	__global float * g_linksMassLSC,
 	__global float * g_linksRestLengthSquared,
 	__global float * g_verticesInverseMass,
 	__global float4 * g_vertexPositions,
 	__local int2 *wavefrontBatchCountsVertexCounts,
 	__local float4 *vertexPositionSharedData,
 	__local float *vertexInverseMassSharedData)
 {
 	const int laneInWavefront = (get_global_id(0) & (WAVEFRONT_SIZE-1));
 	const int wavefront = startWaveInBatch + (get_global_id(0) / WAVEFRONT_SIZE);
 	const int firstWavefrontInBlock = startWaveInBatch + get_group_id(0) * WAVEFRONT_BLOCK_MULTIPLIER;
 	const int localWavefront = wavefront - firstWavefrontInBlock;
 	// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier	
 	if( wavefront < (startWaveInBatch + numWaves) )
 	{	
 		// Load the batch counts for the wavefronts
 		// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier
 		if( laneInWavefront == 0 )
 		{
 			int2 batchesAndVertexCountsWithinWavefront = g_wavefrontBatchCountsVertexCounts[wavefront];
 			wavefrontBatchCountsVertexCounts[localWavefront] = batchesAndVertexCountsWithinWavefront;
 		}
 		barrier(CLK_LOCAL_MEM_FENCE);
 		int2 batchesAndVerticesWithinWavefront = wavefrontBatchCountsVertexCounts[localWavefront];
 		int batchesWithinWavefront = batchesAndVerticesWithinWavefront.x;
 		int verticesUsedByWave = batchesAndVerticesWithinWavefront.y;
 		// Load the vertices for the wavefronts
 		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
 		{
 			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
 			vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_vertexPositions[vertexAddress];
 			vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress];
 		}
 		barrier(CLK_LOCAL_MEM_FENCE);
 		// Loop through the batches performing the solve on each in LDS
 		int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE;	
 		//for( int batch = 0; batch < batchesWithinWavefront; ++batch )
 		int batch = 0;
 		do
 		{
 			int baseDataLocation = baseDataLocationForWave + WAVEFRONT_SIZE * batch;
 			int locationOfValue = baseDataLocation + laneInWavefront;
 			// These loads should all be perfectly linear across the WF
 			int2 localVertexIndices = g_linksVertexIndices[locationOfValue];
 			float massLSC = g_linksMassLSC[locationOfValue];
 			float restLengthSquared = g_linksRestLengthSquared[locationOfValue];
 			// LDS vertex addresses based on logical wavefront number in block and loaded index
 			int vertexAddress0 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.x;
 			int vertexAddress1 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.y;
 			float4 position0 = vertexPositionSharedData[vertexAddress0];
 			float4 position1 = vertexPositionSharedData[vertexAddress1];
 			float inverseMass0 = vertexInverseMassSharedData[vertexAddress0];
 			float inverseMass1 = vertexInverseMassSharedData[vertexAddress1]; 
 			float4 del = position1 - position0;
 			float len = mydot3(del, del);
 			float k = 0;
 			if( massLSC > 0.0f )
 			{		
 				k = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
 			}
 			position0 = position0 - del*(k*inverseMass0);
 			position1 = position1 + del*(k*inverseMass1);
 			// Ensure compiler does not re-order memory operations
 			barrier(CLK_LOCAL_MEM_FENCE);
 			vertexPositionSharedData[vertexAddress0] = position0;
 			vertexPositionSharedData[vertexAddress1] = position1;
 			// Ensure compiler does not re-order memory operations
 			barrier(CLK_LOCAL_MEM_FENCE);
 			++batch;
 		} while( batch < batchesWithinWavefront );
 		// Update the global memory vertices for the wavefronts
 		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
 		{
 			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
 			g_vertexPositions[vertexAddress] = (float4)(vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex].xyz, 0.f);
 		}		
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocities.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocities.cl
@@ -0,0 +1,204 @@
 MSTRINGIFY(
 #pragma OPENCL EXTENSION cl_amd_printf : enable\n
 float mydot3(float4 a, float4 b)
 {
   return a.x*b.x + a.y*b.y + a.z*b.z;
 }
 typedef struct 
 {
 	int firstObject;
 	int endObject;
 } CollisionObjectIndices;
 typedef struct 
 {
 	float4 shapeTransform[4]; // column major 4x4 matrix
 	float4 linearVelocity;
 	float4 angularVelocity;
 	int softBodyIdentifier;
 	int collisionShapeType;
 	// Shape information
 	// Compressed from the union
 	float radius;
 	float halfHeight;
 	int upAxis;
 	float margin;
 	float friction;
 	int padding0;
 } CollisionShapeDescription;
 // From btBroadphaseProxy.h
 __constant int CAPSULE_SHAPE_PROXYTYPE = 10;
 // Multiply column-major matrix against vector
 float4 matrixVectorMul( float4 matrix[4], float4 vector )
 {
 	float4 returnVector;
 	float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
 	float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
 	float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
 	float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
 	returnVector.x = dot(row0, vector);
 	returnVector.y = dot(row1, vector);
 	returnVector.z = dot(row2, vector);
 	returnVector.w = dot(row3, vector);
 	return returnVector;
 }
 __kernel void 
 SolveCollisionsAndUpdateVelocitiesKernel( 
 	const int numNodes,
 	const float isolverdt,
 	__global int *g_vertexClothIdentifier,
 	__global float4 *g_vertexPreviousPositions,
 	__global float * g_perClothFriction,
 	__global float * g_clothDampingFactor,
 	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
 	__global CollisionShapeDescription * g_collisionObjectDetails,
 	__global float4 * g_vertexForces,
 	__global float4 *g_vertexVelocities,
 	__global float4 *g_vertexPositions)
 {
 	int nodeID = get_global_id(0);
 	float4 forceOnVertex = (float4)(0.f, 0.f, 0.f, 0.f);
 	if( get_global_id(0) < numNodes )
 	{	
 		int clothIdentifier = g_vertexClothIdentifier[nodeID];
 		// Abort if this is not a valid cloth
 		if( clothIdentifier < 0 )
 			return;
 		float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f);
 		float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f);
 		float clothFriction = g_perClothFriction[clothIdentifier];
 		float dampingFactor = g_clothDampingFactor[clothIdentifier];
 		float velocityCoefficient = (1.f - dampingFactor);		
 		float4 difference = position - previousPosition;
 		float4 velocity = difference*velocityCoefficient*isolverdt;
 		CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
 		int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
 		if( numObjects > 0 )
 		{
 			// We have some possible collisions to deal with
 			for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision )
 			{
 				CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision];
 				float colliderFriction = shapeDescription.friction;
 				if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
 				{
 					// Colliding with a capsule
 					float capsuleHalfHeight = shapeDescription.halfHeight;
 					float capsuleRadius = shapeDescription.radius;
 					float capsuleMargin = shapeDescription.margin;
 					int capsuleupAxis = shapeDescription.upAxis;
 					// Four columns of worldTransform matrix
 					float4 worldTransform[4];
 					worldTransform[0] = shapeDescription.shapeTransform[0];
 					worldTransform[1] = shapeDescription.shapeTransform[1];
 					worldTransform[2] = shapeDescription.shapeTransform[2];
 					worldTransform[3] = shapeDescription.shapeTransform[3];
 					// Correctly define capsule centerline vector 
 					float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); 
 					float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
 					c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
 					c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
 					c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
 					c2.x = -c1.x;
 					c2.y = -c1.y;
 					c2.z = -c1.z;
 					float4 worldC1 = matrixVectorMul(worldTransform, c1);
 					float4 worldC2 = matrixVectorMul(worldTransform, c2);
 					float4 segment = (worldC2 - worldC1);
 					// compute distance of tangent to vertex along line segment in capsule
 					float distanceAlongSegment = -( mydot3( (worldC1 - position), segment ) / mydot3(segment, segment) );
 					float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment));
 					float distanceFromLine = length(position - closestPoint);
 					float distanceFromC1 = length(worldC1 - position);
 					float distanceFromC2 = length(worldC2 - position);
 					// Final distance from collision, point to push from, direction to push in
 					// for impulse force
 					float dist;
 					float4 normalVector;
 					if( distanceAlongSegment < 0 )
 					{
 						dist = distanceFromC1;
 						normalVector = (float4)(normalize(position - worldC1).xyz, 0.f);
 					} else if( distanceAlongSegment > 1.f ) {
 						dist = distanceFromC2;
 						normalVector = (float4)(normalize(position - worldC2).xyz, 0.f);	
 					} else {
 						dist = distanceFromLine;
 						normalVector = (float4)(normalize(position - closestPoint).xyz, 0.f);
 					}
 					float4 colliderLinearVelocity = shapeDescription.linearVelocity;
 					float4 colliderAngularVelocity = shapeDescription.angularVelocity;
 					float4 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position - (float4)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w, 0.f));
 					float minDistance = capsuleRadius + capsuleMargin;
 					// In case of no collision, this is the value of velocity
 					velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
 					// Check for a collision
 					if( dist < minDistance )
 					{
 						// Project back to surface along normal
 						position = position + (float4)((minDistance - dist)*normalVector*0.9f);
 						velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
 						float4 relativeVelocity = velocity - velocityOfSurfacePoint;
 						float4 p1 = normalize(cross(normalVector, segment));
 						float4 p2 = normalize(cross(p1, normalVector));
 						// Full friction is sum of velocities in each direction of plane
 						float4 frictionVector = p1*mydot3(relativeVelocity, p1) + p2*mydot3(relativeVelocity, p2);
 						// Real friction is peak friction corrected by friction coefficients
 						frictionVector = frictionVector * (colliderFriction*clothFriction);
 						float approachSpeed = dot(relativeVelocity, normalVector);
 						if( approachSpeed <= 0.0f )
 							forceOnVertex -= frictionVector;
 					}
 				}
 			}
 		}
 		g_vertexVelocities[nodeID] = (float4)(velocity.xyz, 0.f);	
 		// Update external force
 		g_vertexForces[nodeID] = (float4)(forceOnVertex.xyz, 0.f);
 		g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl
@@ -0,0 +1,219 @@
 MSTRINGIFY(
 float mydot3(float4 a, float4 b)
 {
   return a.x*b.x + a.y*b.y + a.z*b.z;
 }
 typedef struct 
 {
 	int firstObject;
 	int endObject;
 } CollisionObjectIndices;
 typedef struct 
 {
 	float4 shapeTransform[4]; // column major 4x4 matrix
 	float4 linearVelocity;
 	float4 angularVelocity;
 	int softBodyIdentifier;
 	int collisionShapeType;
 	// Shape information
 	// Compressed from the union
 	float radius;
 	float halfHeight;
 	int upAxis;
 	float margin;
 	float friction;
 	int padding0;
 } CollisionShapeDescription;
 // From btBroadphaseProxy.h
 __constant int CAPSULE_SHAPE_PROXYTYPE = 10;
 /* Multiply column-major matrix against vector */
 float4 matrixVectorMul( float4 matrix[4], float4 vector )
 {
 	float4 returnVector;
 	float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
 	float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
 	float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
 	float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
 	returnVector.x = dot(row0, vector);
 	returnVector.y = dot(row1, vector);
 	returnVector.z = dot(row2, vector);
 	returnVector.w = dot(row3, vector);
 	return returnVector;
 }
 __kernel void 
 SolveCollisionsAndUpdateVelocitiesKernel( 
 	const int numNodes,
 	const float isolverdt,
 	__global int *g_vertexClothIdentifier,
 	__global float4 *g_vertexPreviousPositions,
 	__global float * g_perClothFriction,
 	__global float * g_clothDampingFactor,
 	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
 	__global CollisionShapeDescription * g_collisionObjectDetails,
 	__global float4 * g_vertexForces,
 	__global float4 *g_vertexVelocities,
 	__global float4 *g_vertexPositions,
 	__local CollisionShapeDescription *localCollisionShapes)
 {
 	int nodeID = get_global_id(0);
 	float4 forceOnVertex = (float4)(0.f, 0.f, 0.f, 0.f);
 	int clothIdentifier = g_vertexClothIdentifier[nodeID];
 	// Abort if this is not a valid cloth
 	if( clothIdentifier < 0 )
 		return;
 	float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f);
 	float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f);
 	float clothFriction = g_perClothFriction[clothIdentifier];
 	float dampingFactor = g_clothDampingFactor[clothIdentifier];
 	float velocityCoefficient = (1.f - dampingFactor);		
 	// Update velocity	
 	float4 difference = position - previousPosition;
 	float4 velocity = difference*velocityCoefficient*isolverdt;			
 	CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
 	int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
 	if( numObjects > 0 )
 	{
 		// We have some possible collisions to deal with
 		// First load all of the collision objects into LDS
 		int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
 		if( get_local_id(0) < numObjects )
 		{
 			localCollisionShapes[get_local_id(0)] = g_collisionObjectDetails[ collisionObjectIndices.firstObject + get_local_id(0) ];
 		}
 	}
 	// Safe as the vertices are padded so that not more than one soft body is in a group
 	barrier(CLK_LOCAL_MEM_FENCE);
 	// Annoyingly, even though I know the flow control is not varying, the compiler will not let me skip this
 	if( numObjects > 0 )
 	{
 		// We have some possible collisions to deal with
 		for( int collision = 0; collision < numObjects; ++collision )
 		{
 			//CollisionShapeDescription shapeDescription = localCollisionShapes[collision];
 			float colliderFriction = localCollisionShapes[collision].friction;
 			if( localCollisionShapes[collision].collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
 			{
 				// Colliding with a capsule
 				float capsuleHalfHeight = localCollisionShapes[collision].halfHeight;
 				float capsuleRadius = localCollisionShapes[collision].radius;
 				float capsuleMargin = localCollisionShapes[collision].margin;
 				int capsuleupAxis = localCollisionShapes[collision].upAxis;
 				float4 worldTransform[4];
 				worldTransform[0] = localCollisionShapes[collision].shapeTransform[0];
 				worldTransform[1] = localCollisionShapes[collision].shapeTransform[1];
 				worldTransform[2] = localCollisionShapes[collision].shapeTransform[2];
 				worldTransform[3] = localCollisionShapes[collision].shapeTransform[3];
 				//float4 c1 = (float4)(0.f, -capsuleHalfHeight, 0.f, 1.f); 
 				//float4 c2 = (float4)(0.f, +capsuleHalfHeight, 0.f, 1.f);
 				// Correctly define capsule centerline vector 
 				float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); 
 				float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
 				c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
 				c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
 				c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
 				c2.x = -c1.x;
 				c2.y = -c1.y;
 				c2.z = -c1.z;
 				float4 worldC1 = matrixVectorMul(worldTransform, c1);
 				float4 worldC2 = matrixVectorMul(worldTransform, c2);
 				float4 segment = (worldC2 - worldC1);
 				// compute distance of tangent to vertex along line segment in capsule
 				float distanceAlongSegment = -( mydot3( (worldC1 - position), segment ) / mydot3(segment, segment) );
 				float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment));
 				float distanceFromLine = length(position - closestPoint);
 				float distanceFromC1 = length(worldC1 - position);
 				float distanceFromC2 = length(worldC2 - position);
 				// Final distance from collision, point to push from, direction to push in
 				// for impulse force
 				float dist;
 				float4 normalVector;
 				if( distanceAlongSegment < 0 )
 				{
 					dist = distanceFromC1;
 					normalVector = normalize(position - worldC1);
 				} else if( distanceAlongSegment > 1.f ) {
 					dist = distanceFromC2;
 					normalVector = normalize(position - worldC2);	
 				} else {
 					dist = distanceFromLine;
 					normalVector = normalize(position - closestPoint);
 				}
 				float4 colliderLinearVelocity = localCollisionShapes[collision].linearVelocity;
 				float4 colliderAngularVelocity = localCollisionShapes[collision].angularVelocity;
 				float4 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position - (float4)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w, 0.f));
 				float minDistance = capsuleRadius + capsuleMargin;
 				// In case of no collision, this is the value of velocity
 				velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
 				// Check for a collision
 				if( dist < minDistance )
 				{
 					// Project back to surface along normal
 					position = position + (float4)((minDistance - dist)*normalVector*0.9f);
 					velocity = (position - previousPosition) * velocityCoefficient * isolverdt;
 					float4 relativeVelocity = velocity - velocityOfSurfacePoint;
 					float4 p1 = (float4)(normalize(cross(normalVector, segment)).xyz, 0.f);
 					float4 p2 = (float4)(normalize(cross(p1, normalVector)).xyz, 0.f);
 					// Full friction is sum of velocities in each direction of plane
 					float4 frictionVector = p1*mydot3(relativeVelocity, p1) + p2*mydot3(relativeVelocity, p2);
 					// Real friction is peak friction corrected by friction coefficients
 					frictionVector = frictionVector * (colliderFriction*clothFriction);
 					float approachSpeed = dot(relativeVelocity, normalVector);
 					if( approachSpeed <= 0.0f )
 						forceOnVertex -= frictionVector;
 				}
 			}
 		}
 	}
 	g_vertexVelocities[nodeID] = (float4)(velocity.xyz, 0.f);	
 	// Update external force
 	g_vertexForces[nodeID] = (float4)(forceOnVertex.xyz, 0.f);
 	g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCLSIMDAware.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCLSIMDAware.h
@@ -0,0 +1,169 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
 #include "btSoftBodySolverBuffer_OpenCL.h"
 #ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_SIMDAWARE_H
 #define BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_SIMDAWARE_H
 class btSoftBodyLinkDataOpenCLSIMDAware : public btSoftBodyLinkData
 {
 public:
 	bool				m_onGPU;
 	cl_command_queue	m_cqCommandQue;
 	const int m_wavefrontSize;
 	const int m_linksPerWorkItem;
 	const int m_maxLinksPerWavefront;
 	int m_maxBatchesWithinWave;
 	int m_maxVerticesWithinWave;
 	int m_numWavefronts;
 	int m_maxVertex;
 	struct NumBatchesVerticesPair
 	{
 		int numBatches;
 		int numVertices;
 	};
 	btAlignedObjectArray<int>							  m_linksPerWavefront;
 	btAlignedObjectArray<NumBatchesVerticesPair>		  m_numBatchesAndVerticesWithinWaves;
 	btOpenCLBuffer< NumBatchesVerticesPair >			  m_clNumBatchesAndVerticesWithinWaves;
 	// All arrays here will contain batches of m_maxLinksPerWavefront links
 	// ordered by wavefront.
 	// with either global vertex pairs or local vertex pairs
 	btAlignedObjectArray< int >							  m_wavefrontVerticesGlobalAddresses; // List of global vertices per wavefront
 	btOpenCLBuffer<int>									  m_clWavefrontVerticesGlobalAddresses;
 	btAlignedObjectArray< LinkNodePair >				  m_linkVerticesLocalAddresses; // Vertex pair for the link
 	btOpenCLBuffer<LinkNodePair>						  m_clLinkVerticesLocalAddresses;
 	btOpenCLBuffer<float>							      m_clLinkStrength;
 	btOpenCLBuffer<float>								  m_clLinksMassLSC;
 	btOpenCLBuffer<float>								  m_clLinksRestLengthSquared;
 	btOpenCLBuffer<float>								  m_clLinksRestLength;
 	btOpenCLBuffer<float>								  m_clLinksMaterialLinearStiffnessCoefficient;
 	struct BatchPair
 	{
 		int start;
 		int length;
 		BatchPair() :
 			start(0),
 			length(0)
 		{
 		}
 		BatchPair( int s, int l ) : 
 			start( s ),
 			length( l )
 		{
 		}
 	};
 	/**
 	 * Link addressing information for each cloth.
 	 * Allows link locations to be computed independently of data batching.
 	 */
 	btAlignedObjectArray< int >							m_linkAddresses;
 	/**
 	 * Start and length values for computation batches over link data.
 	 */
 	btAlignedObjectArray< BatchPair >		m_wavefrontBatchStartLengths;
 	btSoftBodyLinkDataOpenCLSIMDAware(cl_command_queue queue, cl_context ctx);
 	virtual ~btSoftBodyLinkDataOpenCLSIMDAware();
 	/** Allocate enough space in all link-related arrays to fit numLinks links */
 	virtual void createLinks( int numLinks );
 	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
 	virtual void setLinkAt( 
 		const LinkDescription &link, 
 		int linkIndex );
 	virtual bool onAccelerator();
 	virtual bool moveToAccelerator();
 	virtual bool moveFromAccelerator();
 	/**
 	 * Generate (and later update) the batching for the entire link set.
 	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
 	 * In theory we could delay it until just before we need the cloth.
 	 * It's a one-off overhead, though, so that is a later optimisation.
 	 */
 	void generateBatches();
 	int getMaxVerticesPerWavefront()
 	{
 		return m_maxVerticesWithinWave;
 	}
 	int getWavefrontSize()
 	{
 		return m_wavefrontSize;
 	}
 	int getLinksPerWorkItem()
 	{
 		return m_linksPerWorkItem;
 	}
 	int getMaxLinksPerWavefront()
 	{
 		return m_maxLinksPerWavefront;
 	}
 	int getMaxBatchesPerWavefront()
 	{
 		return m_maxBatchesWithinWave;
 	}
 	int getNumWavefronts()
 	{
 		return m_numWavefronts;
 	}
 	NumBatchesVerticesPair getNumBatchesAndVerticesWithinWavefront( int wavefront )
 	{
 		return m_numBatchesAndVerticesWithinWaves[wavefront];
 	}
 	int getVertexGlobalAddresses( int vertexIndex )
 	{
 		return m_wavefrontVerticesGlobalAddresses[vertexIndex];
 	}
 	/**
 	 * Get post-batching local addresses of the vertex pair for a link assuming all vertices used by a wavefront are loaded locally.
 	 */
 	LinkNodePair getVertexPairLocalAddresses( int linkIndex )
 	{
 		return m_linkVerticesLocalAddresses[linkIndex];
 	}
 };
 #endif // #ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_SIMDAWARE_H
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp
@@ -0,0 +1,133 @@
 #include "btSoftBodySolverOutputCLtoGL.h"
 #include <stdio.h> //@todo: remove the debugging printf at some stage
 #include "btSoftBodySolver_OpenCL.h"
 #include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
 #include "btSoftBodySolverVertexBuffer_OpenGL.h"
 #include "BulletSoftBody/btSoftBody.h"
 #if (0)//CL_VERSION_1_1 == 1)
 //OpenCL 1.1 kernels use float3
 #define MSTRINGIFY(A) #A
 static char* OutputToVertexArrayCLString =
 #include "OpenCLC/OutputToVertexArray.cl"
 #else
 ////OpenCL 1.0 kernels don't use float3
 #define MSTRINGIFY(A) #A
 static char* OutputToVertexArrayCLString =
 #include "OpenCLC10/OutputToVertexArray.cl"
 #endif //CL_VERSION_1_1
 #define RELEASE_CL_KERNEL(kernelName) {if( kernelName ){ clReleaseKernel( kernelName ); kernelName = 0; }}
 static const size_t workGroupSize = 128;
 void btSoftBodySolverOutputCLtoGL::copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer )
 {
 	btSoftBodySolver *solver = softBody->getSoftBodySolver();
 	btAssert( solver->getSolverType() == btSoftBodySolver::CL_SOLVER || solver->getSolverType() == btSoftBodySolver::CL_SIMD_SOLVER );
 	btOpenCLSoftBodySolver *dxSolver = static_cast< btOpenCLSoftBodySolver * >( solver );
 	checkInitialized();
 	btOpenCLAcceleratedSoftBodyInterface* currentCloth = dxSolver->findSoftBodyInterface( softBody );
 	btSoftBodyVertexDataOpenCL &vertexData( dxSolver->m_vertexData );	
 	const int firstVertex = currentCloth->getFirstVertex();
 	const int lastVertex = firstVertex + currentCloth->getNumVertices();
 	if( vertexBuffer->getBufferType() == btVertexBufferDescriptor::OPENGL_BUFFER ) {		
 		const btOpenGLInteropVertexBufferDescriptor *openGLVertexBuffer = static_cast< btOpenGLInteropVertexBufferDescriptor* >(vertexBuffer);						
 		cl_int ciErrNum = CL_SUCCESS;    
 		cl_mem clBuffer = openGLVertexBuffer->getBuffer();		
 		cl_kernel outputKernel = outputToVertexArrayWithNormalsKernel;
 		if( !vertexBuffer->hasNormals() )
 			outputKernel = outputToVertexArrayWithoutNormalsKernel;
 		ciErrNum = clEnqueueAcquireGLObjects(m_cqCommandQue, 1, &clBuffer, 0, 0, NULL);
 		if( ciErrNum != CL_SUCCESS )
 		{
 			btAssert( 0 &&  "clEnqueueAcquireGLObjects(copySoftBodyToVertexBuffer)");
 		}
 		int numVertices = currentCloth->getNumVertices();
 		ciErrNum = clSetKernelArg(outputKernel, 0, sizeof(int), &firstVertex );
 		ciErrNum = clSetKernelArg(outputKernel, 1, sizeof(int), &numVertices );
 		ciErrNum = clSetKernelArg(outputKernel, 2, sizeof(cl_mem), (void*)&clBuffer );
 		if( vertexBuffer->hasVertexPositions() )
 		{
 			int vertexOffset = vertexBuffer->getVertexOffset();
 			int vertexStride = vertexBuffer->getVertexStride();
 			ciErrNum = clSetKernelArg(outputKernel, 3, sizeof(int), &vertexOffset );
 			ciErrNum = clSetKernelArg(outputKernel, 4, sizeof(int), &vertexStride );
 			ciErrNum = clSetKernelArg(outputKernel, 5, sizeof(cl_mem), (void*)&vertexData.m_clVertexPosition.m_buffer );
 		}
 		if( vertexBuffer->hasNormals() )
 		{
 			int normalOffset = vertexBuffer->getNormalOffset();
 			int normalStride = vertexBuffer->getNormalStride();
 			ciErrNum = clSetKernelArg(outputKernel, 6, sizeof(int), &normalOffset );
 			ciErrNum = clSetKernelArg(outputKernel, 7, sizeof(int), &normalStride );
 			ciErrNum = clSetKernelArg(outputKernel, 8, sizeof(cl_mem), (void*)&vertexData.m_clVertexNormal.m_buffer );
 		}
 		size_t	numWorkItems = workGroupSize*((vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
 		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, outputKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
 		if( ciErrNum != CL_SUCCESS ) 
 		{
 			btAssert( 0 &&  "enqueueNDRangeKernel(copySoftBodyToVertexBuffer)");
 		}
 		ciErrNum = clEnqueueReleaseGLObjects(m_cqCommandQue, 1, &clBuffer, 0, 0, 0);
 		if( ciErrNum != CL_SUCCESS )
 		{
 			btAssert( 0 &&  "clEnqueueReleaseGLObjects(copySoftBodyToVertexBuffer)");
 		}
 	} else {
 		btAssert( "Undefined output for this solver output" == false );
 	}
 	// clFinish in here may not be the best thing. It's possible that we should have a waitForFrameComplete function.
 	clFinish(m_cqCommandQue);
 } // btSoftBodySolverOutputCLtoGL::outputToVertexBuffers
 bool btSoftBodySolverOutputCLtoGL::buildShaders()
 {
 	// Ensure current kernels are released first
 	releaseKernels();
 	bool returnVal = true;
 	if( m_shadersInitialized )
 		return true;
 	outputToVertexArrayWithNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithNormalsKernel" );
 	outputToVertexArrayWithoutNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithoutNormalsKernel" );
 	if( returnVal )
 		m_shadersInitialized = true;
 	return returnVal;
 } // btSoftBodySolverOutputCLtoGL::buildShaders
 void btSoftBodySolverOutputCLtoGL::releaseKernels()
 {
 	RELEASE_CL_KERNEL( outputToVertexArrayWithNormalsKernel );
 	RELEASE_CL_KERNEL( outputToVertexArrayWithoutNormalsKernel );
 	m_shadersInitialized = false;
 } // btSoftBodySolverOutputCLtoGL::releaseKernels
 bool btSoftBodySolverOutputCLtoGL::checkInitialized()
 {
 	if( !m_shadersInitialized )
 		if( buildShaders() )
 			m_shadersInitialized = true;
 	return m_shadersInitialized;
 }
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.h
@@ -0,0 +1,62 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #ifndef BT_SOFT_BODY_SOLVER_OUTPUT_CL_TO_GL_H
 #define BT_SOFT_BODY_SOLVER_OUTPUT_CL_TO_GL_H
 #include "btSoftBodySolver_OpenCL.h"
 /** 
 * Class to manage movement of data from a solver to a given target.
 * This version is the CL to GL interop version.
 */
 class btSoftBodySolverOutputCLtoGL : public btSoftBodySolverOutput
 {
 protected:
 	cl_command_queue	m_cqCommandQue;
 	cl_context			m_cxMainContext;
 	CLFunctions			clFunctions;
 	cl_kernel		outputToVertexArrayWithNormalsKernel;
 	cl_kernel		outputToVertexArrayWithoutNormalsKernel;
 	bool m_shadersInitialized;
 	virtual bool checkInitialized();	
 	virtual bool buildShaders();
 	void releaseKernels();
 public:
 	btSoftBodySolverOutputCLtoGL(cl_command_queue cqCommandQue, cl_context cxMainContext) :
 		m_cqCommandQue( cqCommandQue ),
 		m_cxMainContext( cxMainContext ),
 		clFunctions(cqCommandQue, cxMainContext),
 		outputToVertexArrayWithNormalsKernel( 0 ),
 		outputToVertexArrayWithoutNormalsKernel( 0 ),
 		m_shadersInitialized( false )
 	{
 	}
 	virtual ~btSoftBodySolverOutputCLtoGL()
 	{
 		releaseKernels();
 	}
 	/** Output current computed vertex data to the vertex buffers for all cloths in the solver. */
 	virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer );
 };
 #endif // #ifndef BT_SOFT_BODY_SOLVER_OUTPUT_CL_TO_GL_H
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexBuffer_OpenGL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexBuffer_OpenGL.h
@@ -0,0 +1,168 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_OPENGL_H
 #define BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_OPENGL_H 
 #include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
 #ifdef USE_MINICL
 	#include "MiniCL/cl.h"
 #else //USE_MINICL
 	#ifdef __APPLE__
 		#include <OpenCL/OpenCL.h>
 	#else
 		#include <CL/cl.h>
 	#endif //__APPLE__
 #endif//USE_MINICL
 #ifndef USE_MINICL
 #include <CL/cl_gl.h>
 #endif //USE_MINICL
 #ifdef _WIN32//for glut.h
 #include <windows.h>
 #endif
 //think different
 #if defined(__APPLE__) && !defined (VMDMESA)
 #include <OpenGL/OpenGL.h>
 #include <OpenGL/gl.h>
 #include <OpenGL/glu.h>
 #include <GLUT/glut.h>
 #else
 #ifdef _WINDOWS
 #include <windows.h>
 #include <GL/gl.h>
 #include <GL/glu.h>
 #else
 #include <GL/glut.h>
 #endif //_WINDOWS
 #endif //APPLE
 class btOpenGLInteropVertexBufferDescriptor : public btVertexBufferDescriptor
 {
 protected:
 	/** OpenCL context */
 	cl_context			m_context;
 	/** OpenCL command queue */
 	cl_command_queue	m_commandQueue;
 	/** OpenCL interop buffer */
 	cl_mem m_buffer;
 	/** VBO in GL that is the basis of the interop buffer */
 	GLuint m_openGLVBO;
 public:
 	/**
 	 * context is the OpenCL context this interop buffer will work in.
 	 * queue is the command queue that kernels and data movement will be enqueued into.
 	 * openGLVBO is the OpenGL vertex buffer data will be copied into.
 	 * vertexOffset is the offset in floats to the first vertex.
 	 * vertexStride is the stride in floats between vertices.
 	 */
 	btOpenGLInteropVertexBufferDescriptor( cl_command_queue cqCommandQue, cl_context context, GLuint openGLVBO, int vertexOffset, int vertexStride )
 	{
 #ifndef USE_MINICL
 		cl_int ciErrNum = CL_SUCCESS;
 		m_context = context;
 		m_commandQueue = cqCommandQue;
 		m_vertexOffset = vertexOffset;
 		m_vertexStride = vertexStride;
 		m_openGLVBO = openGLVBO;
 		m_buffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, openGLVBO, &ciErrNum);
 		if( ciErrNum != CL_SUCCESS )
 		{
 			btAssert( 0 &&  "clEnqueueAcquireGLObjects(copySoftBodyToVertexBuffer)");
 		}
 		m_hasVertexPositions = true;
 #else
 		btAssert(0);//MiniCL shouldn't get here
 #endif
 	}
 	/**
 	 * context is the OpenCL context this interop buffer will work in.
 	 * queue is the command queue that kernels and data movement will be enqueued into.
 	 * openGLVBO is the OpenGL vertex buffer data will be copied into.
 	 * vertexOffset is the offset in floats to the first vertex.
 	 * vertexStride is the stride in floats between vertices.
 	 * normalOffset is the offset in floats to the first normal.
 	 * normalStride is the stride in floats between normals.
 	 */
 	btOpenGLInteropVertexBufferDescriptor( cl_command_queue cqCommandQue, cl_context context, GLuint openGLVBO, int vertexOffset, int vertexStride, int normalOffset, int normalStride )
 	{
 #ifndef USE_MINICL
 		cl_int ciErrNum = CL_SUCCESS;
 		m_context = context;
 		m_commandQueue = cqCommandQue;
 		m_openGLVBO = openGLVBO;
 		m_buffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, openGLVBO, &ciErrNum);
 		if( ciErrNum != CL_SUCCESS )
 		{
 			btAssert( 0 &&  "clEnqueueAcquireGLObjects(copySoftBodyToVertexBuffer)");
 		}
 		m_vertexOffset = vertexOffset;
 		m_vertexStride = vertexStride;
 		m_hasVertexPositions = true;
 		m_normalOffset = normalOffset;
 		m_normalStride = normalStride;
 		m_hasNormals = true;
 #else
 		btAssert(0);
 #endif //USE_MINICL
 	}
 	virtual ~btOpenGLInteropVertexBufferDescriptor()
 	{
 		clReleaseMemObject( m_buffer );
 	}
 	/**
 	 * Return the type of the vertex buffer descriptor.
 	 */
 	virtual BufferTypes getBufferType() const
 	{
 		return OPENGL_BUFFER;
 	}
 	virtual cl_context getContext() const
 	{
 		return m_context;
 	}
 	virtual cl_mem getBuffer() const
 	{
 		return m_buffer;
 	}	
 };
 #endif // #ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_OPENGL_H
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h
@@ -0,0 +1,82 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #ifndef BT_SOFT_BODY_SOLVER_OPENCL_SIMDAWARE_H
 #define BT_SOFT_BODY_SOLVER_OPENCL_SIMDAWARE_H
 #include "stddef.h" //for size_t
 #include "vectormath/vmInclude.h"
 #include "btSoftBodySolver_OpenCL.h"
 #include "btSoftBodySolverBuffer_OpenCL.h"
 #include "btSoftBodySolverLinkData_OpenCLSIMDAware.h"
 #include "btSoftBodySolverVertexData_OpenCL.h"
 #include "btSoftBodySolverTriangleData_OpenCL.h"
 class btOpenCLSoftBodySolverSIMDAware : public btOpenCLSoftBodySolver
 {
 protected:
 	btSoftBodyLinkDataOpenCLSIMDAware m_linkData;
 	bool m_shadersInitialized;
 	bool buildShaders();
 	void updateConstants( float timeStep );
 	float computeTriangleArea( 
 		const Vectormath::Aos::Point3 &vertex0,
 		const Vectormath::Aos::Point3 &vertex1,
 		const Vectormath::Aos::Point3 &vertex2 );
 	//////////////////////////////////////
 	// Kernel dispatches
 	void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
 	void solveCollisionsAndUpdateVelocities( float isolverdt );
 	// End kernel dispatches
 	/////////////////////////////////////
 public:
 	btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue,cl_context	ctx);
 	virtual ~btOpenCLSoftBodySolverSIMDAware();
 	virtual SolverTypes getSolverType() const
 	{
 		return CL_SIMD_SOLVER;
 	}
 	virtual btSoftBodyLinkData &getLinkData();
 	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);
 	virtual void solveConstraints( float solverdt );
 }; // btOpenCLSoftBodySolverSIMDAware
 #endif // #ifndef BT_SOFT_BODY_SOLVER_OPENCL_SIMDAWARE_H