Apply a patch that updates vertex position on GPU for the OpenCL version, by Dongsoo Han (saggita), work-in-progress

Removed the unused OpenCL kernels Add example how to cache binary kernels, see SoftDemo compiled with OpenCL AMD using msvc/vs2008_opencl.bat
2011-12-20 18:03:24 +00:00
parent 5cb40ed6e3
commit 1f7a67b7da
31 changed files with 441 additions and 1386 deletions
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexData_DX11.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexData_DX11.h
@@ -55,7 +55,7 @@ public:
 	virtual bool onAccelerator();
 	virtual bool moveToAccelerator();
-	virtual bool moveFromAccelerator();
+	virtual bool moveFromAccelerator(bool bCopy = false, bool bCopyMinimum = true);
 };
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
@@ -308,18 +308,42 @@ bool btSoftBodyVertexDataDX11::moveToAccelerator()
 	return success;
 }
-bool btSoftBodyVertexDataDX11::moveFromAccelerator()
+bool btSoftBodyVertexDataDX11::moveFromAccelerator(bool bCopy, bool bCopyMinimum)
 {
 	bool success = true;
-	success = success && m_dx11ClothIdentifier.moveFromGPU();
+
-	success = success && m_dx11VertexPosition.moveFromGPU();
+	if (!bCopy)
-	success = success && m_dx11VertexPreviousPosition.moveFromGPU();
+	{
-	success = success && m_dx11VertexVelocity.moveFromGPU();
+		success = success && m_dx11ClothIdentifier.moveFromGPU();
-	success = success && m_dx11VertexForceAccumulator.moveFromGPU();
+		success = success && m_dx11VertexPosition.moveFromGPU();
-	success = success && m_dx11VertexNormal.moveFromGPU();
+		success = success && m_dx11VertexPreviousPosition.moveFromGPU();
-	success = success && m_dx11VertexInverseMass.moveFromGPU();
+		success = success && m_dx11VertexVelocity.moveFromGPU();
-	success = success && m_dx11VertexArea.moveFromGPU();
+		success = success && m_dx11VertexForceAccumulator.moveFromGPU();
-	success = success && m_dx11VertexTriangleCount.moveFromGPU();
+		success = success && m_dx11VertexNormal.moveFromGPU();
 		success = success && m_dx11VertexInverseMass.moveFromGPU();
 		success = success && m_dx11VertexArea.moveFromGPU();
 		success = success && m_dx11VertexTriangleCount.moveFromGPU();
 	}
 	else
 	{
 		if (bCopyMinimum)
 		{
 			success = success && m_dx11VertexPosition.copyFromGPU();
 			success = success && m_dx11VertexNormal.copyFromGPU();
 		}
 		else
 		{
 			success = success && m_dx11ClothIdentifier.copyFromGPU();
 			success = success && m_dx11VertexPosition.copyFromGPU();
 			success = success && m_dx11VertexPreviousPosition.copyFromGPU();
 			success = success && m_dx11VertexVelocity.copyFromGPU();
 			success = success && m_dx11VertexForceAccumulator.copyFromGPU();
 			success = success && m_dx11VertexNormal.copyFromGPU();
 			success = success && m_dx11VertexInverseMass.copyFromGPU();
 			success = success && m_dx11VertexArea.copyFromGPU();
 			success = success && m_dx11VertexTriangleCount.copyFromGPU();
 		}
 	}
 	if( success )
 		m_onGPU = true;
@@ -619,10 +643,10 @@ void btDX11SoftBodySolver::releaseKernels()
 }
-void btDX11SoftBodySolver::copyBackToSoftBodies()
+void btDX11SoftBodySolver::copyBackToSoftBodies(bool bMove)
 {
 	// Move the vertex data back to the host first
-	m_vertexData.moveFromAccelerator();
+	m_vertexData.moveFromAccelerator(!bMove);
 	// Loop over soft bodies, copying all the vertex positions back for each body in turn
 	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
@@ -607,7 +607,7 @@ public:
 	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);
-	virtual void copyBackToSoftBodies();
+	virtual void copyBackToSoftBodies(bool bMove = true);
 	virtual void solveConstraints( float solverdt );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
@@ -34,7 +34,6 @@ SET(BulletSoftBodyOpenCLSolvers_HDRS
 ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_AMD
 	${BulletSoftBodyOpenCLSolvers_SRCS} 
 	${BulletSoftBodyOpenCLSolvers_HDRS} 
 	${BulletSoftBodyOpenCLSolvers_OpenCLC}
 )
 SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES VERSION ${BULLET_VERSION})
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Intel/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Intel/CMakeLists.txt
@@ -45,7 +45,7 @@ SET(BulletSoftBodyOpenCLSolvers_Shaders
 )
 foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
-    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC/${f}.cl")
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${f}.cl")
 endforeach(f) 
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
@@ -25,6 +25,7 @@ subject to the following restrictions:
 #include "../OpenCLC10/UpdatePositions.cl"
 #include "../OpenCLC10/UpdatePositionsFromVelocities.cl"
 #include "../OpenCLC10/VSolveLinks.cl"
 #include "../OpenCLC10/UpdateFixedVertexPositions.cl"
 //#include "../OpenCLC10/SolveCollisionsAndUpdateVelocities.cl"
@@ -39,7 +40,7 @@ MINICL_REGISTER(ApplyForcesKernel)
 MINICL_REGISTER(ResetNormalsAndAreasKernel)
 MINICL_REGISTER(NormalizeNormalsAndAreasKernel)
 MINICL_REGISTER(UpdateSoftBodiesKernel)
-
+MINICL_REGISTER(UpdateFixedVertexPositions)
 float mydot3a(float4 a, float4 b)
 {
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
@@ -44,7 +44,7 @@ SET(BulletSoftBodyOpenCLSolvers_Shaders
 )
 foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
-    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC/${f}.cl")
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${f}.cl")
 endforeach(f) 
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ApplyForces.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ApplyForces.cl
@@ -1,91 +0,0 @@
 MSTRINGIFY(
 /*#define float3 float4
 float dot3(float3 a, float3 b)
 {
   return a.x*b.x + a.y*b.y + a.z*b.z;
 }*/
 float3 projectOnAxis( float3 v, float3 a )
 {
 	return (a*dot(v, a));
 }
 __kernel void 
 ApplyForcesKernel(
 	const uint numNodes,
 	const float solverdt,
 	const float epsilon,
 	__global int * g_vertexClothIdentifier,
 	__global float4 * g_vertexNormal,
 	__global float * g_vertexArea,
 	__global float * g_vertexInverseMass,
 	__global float * g_clothLiftFactor,
 	__global float * g_clothDragFactor,
 	__global float4 * g_clothWindVelocity,
 	__global float4 * g_clothAcceleration,
 	__global float * g_clothMediumDensity,
 	__global float4 * g_vertexForceAccumulator,
 	__global float4 * g_vertexVelocity)
 {
 	unsigned int nodeID = get_global_id(0);
 	if( nodeID < numNodes )
 	{		
 		int clothId  = g_vertexClothIdentifier[nodeID];
 		float nodeIM = g_vertexInverseMass[nodeID];
 		if( nodeIM > 0.0f )
 		{
 			float3 nodeV  = g_vertexVelocity[nodeID].xyz;
 			float3 normal = g_vertexNormal[nodeID].xyz;
 			float area    = g_vertexArea[nodeID];
 			float3 nodeF  = g_vertexForceAccumulator[nodeID].xyz;
 			// Read per-cloth values
 			float3 clothAcceleration = g_clothAcceleration[clothId].xyz;
 			float3 clothWindVelocity = g_clothWindVelocity[clothId].xyz;
 			float liftFactor = g_clothLiftFactor[clothId];
 			float dragFactor = g_clothDragFactor[clothId];
 			float mediumDensity = g_clothMediumDensity[clothId];
 			// Apply the acceleration to the cloth rather than do this via a force
 			nodeV += (clothAcceleration*solverdt);
 			g_vertexVelocity[nodeID] = (float4)(nodeV, 0.f);
 			float3 relativeWindVelocity = nodeV - clothWindVelocity;
 			float relativeSpeedSquared = dot(relativeWindVelocity, relativeWindVelocity);
 			if( relativeSpeedSquared > epsilon )
 			{
 				// Correct direction of normal relative to wind direction and get dot product
 				normal = normal * (dot(normal, relativeWindVelocity) < 0 ? -1.f : 1.f);
 				float dvNormal = dot(normal, relativeWindVelocity);
 				if( dvNormal > 0 )
 				{
 					float3 force = (float3)(0.f, 0.f, 0.f);
 					float c0 = area * dvNormal * relativeSpeedSquared / 2.f;
 					float c1 = c0 * mediumDensity;
 					force += normal * (-c1 * liftFactor);
 					force += normalize(relativeWindVelocity)*(-c1 * dragFactor);
 					float dtim = solverdt * nodeIM;
 					float3 forceDTIM = force * dtim;
 					float3 nodeFPlusForce = nodeF + force;
 					// m_nodesf[i] -= ProjectOnAxis(m_nodesv[i], force.normalized())/dtim;	
 					float3 nodeFMinus = nodeF - (projectOnAxis(nodeV, normalize(force))/dtim);
 					nodeF = nodeFPlusForce;
 					if( dot(forceDTIM, forceDTIM) > dot(nodeV, nodeV) )
 						nodeF = nodeFMinus;
 					g_vertexForceAccumulator[nodeID] = (float4)(nodeF, 0.0f);	
 				}
 			}
 		}
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ComputeBounds.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ComputeBounds.cl
@@ -1,80 +0,0 @@
 MSTRINGIFY(
 #pragma OPENCL EXTENSION cl_amd_printf : enable \n
 __kernel void
 ComputeBoundsKernel( 
 	int numNodes,
 	int numSoftBodies,
 	__global int * g_vertexClothIdentifier,
 	__global float4 * g_vertexPositions,
 	volatile __global uint * g_clothMinBounds,
 	volatile __global uint * g_clothMaxBounds,
 	volatile __local uint * clothMinBounds,
 	volatile __local uint * clothMaxBounds)
 {
 	// Init min and max bounds arrays
 	if( get_local_id(0) < numSoftBodies )
 	{
 		clothMinBounds[get_local_id(0)*4] = UINT_MAX;
 		clothMinBounds[get_local_id(0)*4+1] = UINT_MAX;
 		clothMinBounds[get_local_id(0)*4+2] = UINT_MAX;
 		clothMinBounds[get_local_id(0)*4+3] = UINT_MAX;
 		clothMaxBounds[get_local_id(0)*4] = 0;
 		clothMaxBounds[get_local_id(0)*4+1] = 0;
 		clothMaxBounds[get_local_id(0)*4+2] = 0;
 		clothMaxBounds[get_local_id(0)*4+3] = 0;
 	}
 	barrier(CLK_GLOBAL_MEM_FENCE);
 	int nodeID = get_global_id(0);
 	if( nodeID < numNodes )
 	{	
 		int clothIdentifier = g_vertexClothIdentifier[get_global_id(0)];
 		if( clothIdentifier >= 0 )
 		{
 			float3 position = g_vertexPositions[get_global_id(0)].xyz;
 			/* Reinterpret position as uint */
 			uint3 positionUInt = (uint3)(as_uint(position.x), as_uint(position.y), as_uint(position.z));
 			/* Invert sign bit of positives and whole of negatives to allow comparison as unsigned ints */
 			positionUInt.x ^= (1+~(positionUInt.x >> 31) | 0x80000000);
 			positionUInt.y ^= (1+~(positionUInt.y >> 31) | 0x80000000);		
 			positionUInt.z ^= (1+~(positionUInt.z >> 31) | 0x80000000);
 			/* Min/max with the LDS values */
 			atomic_min(&(clothMinBounds[clothIdentifier*4]), positionUInt.x);
 			atomic_min(&(clothMinBounds[clothIdentifier*4+1]), positionUInt.y);
 			atomic_min(&(clothMinBounds[clothIdentifier*4+2]), positionUInt.z);
 			atomic_max(&(clothMaxBounds[clothIdentifier*4]), positionUInt.x);
 			atomic_max(&(clothMaxBounds[clothIdentifier*4+1]), positionUInt.y);
 			atomic_max(&(clothMaxBounds[clothIdentifier*4+2]), positionUInt.z);
 		}
 	}
 	barrier(CLK_GLOBAL_MEM_FENCE);
 	/* Use global atomics to update the global versions of the data*/
 	if( get_local_id(0) < numSoftBodies )
 	{
 		/*atomic_min(&(g_clothMinBounds[get_local_id(0)].x), clothMinBounds[get_local_id(0)].x);*/
 		atomic_min(&(g_clothMinBounds[get_local_id(0)*4]), clothMinBounds[get_local_id(0)*4]);
 		atomic_min(&(g_clothMinBounds[get_local_id(0)*4+1]), clothMinBounds[get_local_id(0)*4+1]);
 		atomic_min(&(g_clothMinBounds[get_local_id(0)*4+2]), clothMinBounds[get_local_id(0)*4+2]);
 		atomic_max(&(g_clothMaxBounds[get_local_id(0)*4]), clothMaxBounds[get_local_id(0)*4]);		
 		atomic_max(&(g_clothMaxBounds[get_local_id(0)*4+1]), clothMaxBounds[get_local_id(0)*4+1]);
 		atomic_max(&(g_clothMaxBounds[get_local_id(0)*4+2]), clothMaxBounds[get_local_id(0)*4+2]);
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/Integrate.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/Integrate.cl
@@ -1,35 +0,0 @@
 MSTRINGIFY(
 // Node indices for each link
 //#define float3 float4
 __kernel void
 IntegrateKernel( 
 	const int numNodes,
 	const float solverdt,
 	__global float * g_vertexInverseMasses,
 	__global float4 * g_vertexPositions,
 	__global float4 * g_vertexVelocity,
 	__global float4 * g_vertexPreviousPositions,
 	__global float4 * g_vertexForceAccumulator)
 {
 	int nodeID = get_global_id(0);
 	if( nodeID < numNodes )
 	{	
 		float3 position   = g_vertexPositions[nodeID].xyz;
 		float3 velocity   = g_vertexVelocity[nodeID].xyz;
 		float3 force      = g_vertexForceAccumulator[nodeID].xyz;
 		float inverseMass = g_vertexInverseMasses[nodeID];
 		g_vertexPreviousPositions[nodeID] = (float4)(position, 0.f);
 		velocity += force * inverseMass * solverdt;
 		position += velocity * solverdt;
 		g_vertexForceAccumulator[nodeID] = (float4)(0.f, 0.f, 0.f, 0.0f);
 		g_vertexPositions[nodeID]        = (float4)(position, 0.f);
 		g_vertexVelocity[nodeID]         = (float4)(velocity, 0.f);	
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/OutputToVertexArray.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/OutputToVertexArray.cl
@@ -1,46 +0,0 @@
 MSTRINGIFY(
 cbuffer OutputToVertexArrayCB : register( b0 )
 {
 	int startNode;
 	int numNodes;
 	int offsetX;
 	int strideX;
 	int offsetN;	
 	int strideN;
 	int padding1;
 	int padding2;
 };
 StructuredBuffer<float4> g_nodesx : register( t0 );
 StructuredBuffer<float4> g_nodesn : register( t1 );
 RWStructuredBuffer<float> g_vertexBuffer : register( u0 );
 [numthreads(128, 1, 1)]
 void 
 OutputToVertexArrayKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
 {
 	int nodeID = DTid.x;
 	if( nodeID < numNodes )
 	{			
 		float4 nodeX = g_nodesx[nodeID + startNode];
 		float4 nodeN = g_nodesn[nodeID + startNode];
 		// Stride should account for the float->float4 conversion
 		int positionDestination = nodeID * strideX + offsetX;		
 		g_vertexBuffer[positionDestination] = nodeX.x;
 		g_vertexBuffer[positionDestination+1] = nodeX.y;
 		g_vertexBuffer[positionDestination+2] = nodeX.z;
 		int normalDestination = nodeID * strideN + offsetN;
 		g_vertexBuffer[normalDestination] = nodeN.x;
 		g_vertexBuffer[normalDestination+1] = nodeN.y;
 		g_vertexBuffer[normalDestination+2] = nodeN.z;		
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/PrepareLinks.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/PrepareLinks.cl
@@ -1,34 +0,0 @@
 MSTRINGIFY(
 __kernel void 
 PrepareLinksKernel( 
 	const int numLinks,
 	__global int2 * g_linksVertexIndices,
 	__global float * g_linksMassLSC,
 	__global float4 * g_nodesPreviousPosition,
 	__global float * g_linksLengthRatio,
 	__global float4 * g_linksCurrentLength)
 {
 	int linkID = get_global_id(0);
 	if( linkID < numLinks )
 	{	
 		int2 nodeIndices = g_linksVertexIndices[linkID];
 		int node0 = nodeIndices.x;
 		int node1 = nodeIndices.y;
 		float4 nodePreviousPosition0 = g_nodesPreviousPosition[node0];
 		float4 nodePreviousPosition1 = g_nodesPreviousPosition[node1];
 		float massLSC = g_linksMassLSC[linkID];
 		float4 linkCurrentLength = nodePreviousPosition1 - nodePreviousPosition0;
 		float linkLengthRatio = dot(linkCurrentLength, linkCurrentLength)*massLSC;
 		linkLengthRatio = 1.0f/linkLengthRatio;
 		g_linksCurrentLength[linkID] = linkCurrentLength;
 		g_linksLengthRatio[linkID]   = linkLengthRatio;		
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolveCollisionsAndUpdateVelocities.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolveCollisionsAndUpdateVelocities.cl
@@ -1,195 +0,0 @@
 MSTRINGIFY(
 typedef struct 
 {
 	int firstObject;
 	int endObject;
 } CollisionObjectIndices;
 typedef struct 
 {
 	float4 shapeTransform[4]; // column major 4x4 matrix
 	float4 linearVelocity;
 	float4 angularVelocity;
 	int softBodyIdentifier;
 	int collisionShapeType;
 	// Shape information
 	// Compressed from the union
 	float radius;
 	float halfHeight;
 	int upAxis;
 	float margin;
 	float friction;
 	int padding0;
 } CollisionShapeDescription;
 /* From btBroadphaseProxy.h */
 __constant int CAPSULE_SHAPE_PROXYTYPE = 10;
 /* Multiply column-major matrix against vector */
 float4 matrixVectorMul( float4 matrix[4], float4 vector )
 {
 	float4 returnVector;
 	float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
 	float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
 	float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
 	float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
 	returnVector.x = dot(row0, vector);
 	returnVector.y = dot(row1, vector);
 	returnVector.z = dot(row2, vector);
 	returnVector.w = dot(row3, vector);
 	return returnVector;
 }
 __kernel void 
 SolveCollisionsAndUpdateVelocitiesKernel( 
 	const int numNodes,
 	const float isolverdt,
 	__global int *g_vertexClothIdentifier,
 	__global float4 *g_vertexPreviousPositions,
 	__global float * g_perClothFriction,
 	__global float * g_clothDampingFactor,
 	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
 	__global CollisionShapeDescription * g_collisionObjectDetails,
 	__global float4 * g_vertexForces,
 	__global float4 *g_vertexVelocities,
 	__global float4 *g_vertexPositions)
 {
 	int nodeID = get_global_id(0);
 	float3 forceOnVertex = (float3)(0.f, 0.f, 0.f);
 	if( get_global_id(0) < numNodes )
 	{	
 		int clothIdentifier = g_vertexClothIdentifier[nodeID];
 		// Abort if this is not a valid cloth
 		if( clothIdentifier < 0 )
 			return;
 		float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f);
 		float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f);
 		float3 velocity;
 		float clothFriction = g_perClothFriction[clothIdentifier];
 		float dampingFactor = g_clothDampingFactor[clothIdentifier];
 		float velocityCoefficient = (1.f - dampingFactor);		
 		CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
 		if( collisionObjectIndices.firstObject != collisionObjectIndices.endObject )
 		{
 			velocity = (float3)(15, 0, 0);
 			/* We have some possible collisions to deal with */
 			for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision )
 			{
 				CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision];
 				float colliderFriction = shapeDescription.friction;
 				if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
 				{
 					/* Colliding with a capsule */
 					float capsuleHalfHeight = shapeDescription.halfHeight;
 					float capsuleRadius = shapeDescription.radius;
 					float capsuleMargin = shapeDescription.margin;
 					int capsuleupAxis = shapeDescription.upAxis;
 					/* Four columns of worldTransform matrix */
 					float4 worldTransform[4];
 					worldTransform[0] = shapeDescription.shapeTransform[0];
 					worldTransform[1] = shapeDescription.shapeTransform[1];
 					worldTransform[2] = shapeDescription.shapeTransform[2];
 					worldTransform[3] = shapeDescription.shapeTransform[3];
 					// Correctly define capsule centerline vector 
 					float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); 
 					float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
 					c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
 					c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
 					c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
 					c2.x = -c1.x;
 					c2.y = -c1.y;
 					c2.z = -c1.z;
 					float4 worldC1 = matrixVectorMul(worldTransform, c1);
 					float4 worldC2 = matrixVectorMul(worldTransform, c2);
 					float3 segment = (worldC2 - worldC1).xyz;
 					/* compute distance of tangent to vertex along line segment in capsule */
 					float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
 					float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment, 0.f));
 					float distanceFromLine = length(position - closestPoint);
 					float distanceFromC1 = length(worldC1 - position);
 					float distanceFromC2 = length(worldC2 - position);
 					/* Final distance from collision, point to push from, direction to push in
 					   for impulse force */
 					float dist;
 					float3 normalVector;
 					if( distanceAlongSegment < 0 )
 					{
 						dist = distanceFromC1;
 						normalVector = normalize(position - worldC1).xyz;
 					} else if( distanceAlongSegment > 1.f ) {
 						dist = distanceFromC2;
 						normalVector = normalize(position - worldC2).xyz;	
 					} else {
 						dist = distanceFromLine;
 						normalVector = normalize(position - closestPoint).xyz;
 					}
 					float3 colliderLinearVelocity = shapeDescription.linearVelocity.xyz;
 					float3 colliderAngularVelocity = shapeDescription.angularVelocity.xyz;
 					float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - (float3)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w));
 					float minDistance = capsuleRadius + capsuleMargin;
 					/* In case of no collision, this is the value of velocity */
 					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
 					// Check for a collision
 					if( dist < minDistance )
 					{
 						/* Project back to surface along normal */
 						position = position + (float4)((minDistance - dist)*normalVector*0.9f, 0.f);
 						velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
 						float3 relativeVelocity = velocity - velocityOfSurfacePoint;
 						float3 p1 = normalize(cross(normalVector, segment));
 						float3 p2 = normalize(cross(p1, normalVector));
 						/* Full friction is sum of velocities in each direction of plane */
 						float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
 						/* Real friction is peak friction corrected by friction coefficients */
 						frictionVector = frictionVector * (colliderFriction*clothFriction);
 						float approachSpeed = dot(relativeVelocity, normalVector);
 						if( approachSpeed <= 0.0f )
 							forceOnVertex -= frictionVector;
 					}
 				}
 			}
 		} else {
 			/* Update velocity	 */
 			float3 difference = position.xyz - previousPosition.xyz;
 			velocity = difference*velocityCoefficient*isolverdt;			
 		}
 		g_vertexVelocities[nodeID] = (float4)(velocity, 0.f);	
 		/* Update external force */
 		g_vertexForces[nodeID] = (float4)(forceOnVertex, 0.f);
 		g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl
@@ -1,213 +0,0 @@
 MSTRINGIFY(
 typedef struct 
 {
 	int firstObject;
 	int endObject;
 } CollisionObjectIndices;
 typedef struct 
 {
 	float4 shapeTransform[4]; /* column major 4x4 matrix */
 	float4 linearVelocity;
 	float4 angularVelocity;
 	int softBodyIdentifier;
 	int collisionShapeType;
 	// Shape information
 	// Compressed from the union
 	float radius;
 	float halfHeight;
 	int upAxis;
 	float margin;
 	float friction;
 	int padding0;
 } CollisionShapeDescription;
 /* From btBroadphaseProxy.h */
 __constant int CAPSULE_SHAPE_PROXYTYPE = 10;
 /* Multiply column-major matrix against vector */
 float4 matrixVectorMul( float4 matrix[4], float4 vector )
 {
 	float4 returnVector;
 	float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
 	float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
 	float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
 	float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
 	returnVector.x = dot(row0, vector);
 	returnVector.y = dot(row1, vector);
 	returnVector.z = dot(row2, vector);
 	returnVector.w = dot(row3, vector);
 	return returnVector;
 }
 __kernel void 
 SolveCollisionsAndUpdateVelocitiesKernel( 
 	const int numNodes,
 	const float isolverdt,
 	__global int *g_vertexClothIdentifier,
 	__global float4 *g_vertexPreviousPositions,
 	__global float * g_perClothFriction,
 	__global float * g_clothDampingFactor,
 	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
 	__global CollisionShapeDescription * g_collisionObjectDetails,
 	__global float4 * g_vertexForces,
 	__global float4 *g_vertexVelocities,
 	__global float4 *g_vertexPositions,
 	__local CollisionShapeDescription *localCollisionShapes)
 {
 	int nodeID = get_global_id(0);
 	float3 forceOnVertex = (float3)(0.f, 0.f, 0.f);
 	int clothIdentifier = g_vertexClothIdentifier[nodeID];
 	// Abort if this is not a valid cloth
 	if( clothIdentifier < 0 )
 		return;
 	float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f);
 	float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f);
 	float3 velocity;
 	float clothFriction = g_perClothFriction[clothIdentifier];
 	float dampingFactor = g_clothDampingFactor[clothIdentifier];
 	float velocityCoefficient = (1.f - dampingFactor);		
 	CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
 	int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
 	if( numObjects > 0 )
 	{
 		/* We have some possible collisions to deal with */
 		/* First load all of the collision objects into LDS */
 		int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
 		if( get_local_id(0) < numObjects )
 		{
 			localCollisionShapes[get_local_id(0)] = g_collisionObjectDetails[ collisionObjectIndices.firstObject + get_local_id(0) ];
 		}
 	}
 	/* Safe as the vertices are padded so that not more than one soft body is in a group */
 	barrier(CLK_LOCAL_MEM_FENCE);
 	/* Annoyingly, even though I know the flow control is not varying, the compiler will not let me skip this */
 	if( numObjects > 0 )
 	{
 		velocity = (float3)(0, 0, 0);
 		// We have some possible collisions to deal with
 		for( int collision = 0; collision < numObjects; ++collision )
 		{
 			CollisionShapeDescription shapeDescription = localCollisionShapes[collision];
 			float colliderFriction = shapeDescription.friction;
 			if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
 			{
 				/* Colliding with a capsule */
 				float capsuleHalfHeight = localCollisionShapes[collision].halfHeight;
 				float capsuleRadius = localCollisionShapes[collision].radius;
 				float capsuleMargin = localCollisionShapes[collision].margin;
 				int capsuleupAxis = localCollisionShapes[collision].upAxis;
 				float4 worldTransform[4];
 				worldTransform[0] = localCollisionShapes[collision].shapeTransform[0];
 				worldTransform[1] = localCollisionShapes[collision].shapeTransform[1];
 				worldTransform[2] = localCollisionShapes[collision].shapeTransform[2];
 				worldTransform[3] = localCollisionShapes[collision].shapeTransform[3];
 				// Correctly define capsule centerline vector 
 				float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); 
 				float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
 				c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
 				c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
 				c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
 				c2.x = -c1.x;
 				c2.y = -c1.y;
 				c2.z = -c1.z;
 				float4 worldC1 = matrixVectorMul(worldTransform, c1);
 				float4 worldC2 = matrixVectorMul(worldTransform, c2);
 				float3 segment = (worldC2 - worldC1).xyz;
 				/* compute distance of tangent to vertex along line segment in capsule */
 				float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
 				float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment, 0.f));
 				float distanceFromLine = length(position - closestPoint);
 				float distanceFromC1 = length(worldC1 - position);
 				float distanceFromC2 = length(worldC2 - position);
 				/* Final distance from collision, point to push from, direction to push in
 				   for impulse force */
 				float dist;
 				float3 normalVector;
 				if( distanceAlongSegment < 0 )
 				{
 					dist = distanceFromC1;
 					normalVector = normalize(position - worldC1).xyz;
 				} else if( distanceAlongSegment > 1.f ) {
 					dist = distanceFromC2;
 					normalVector = normalize(position - worldC2).xyz;	
 				} else {
 					dist = distanceFromLine;
 					normalVector = normalize(position - closestPoint).xyz;
 				}
 				float3 colliderLinearVelocity = localCollisionShapes[collision].linearVelocity.xyz;
 				float3 colliderAngularVelocity = localCollisionShapes[collision].angularVelocity.xyz;
 				float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz -  (float3)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w));
 				float minDistance = capsuleRadius + capsuleMargin;
 				/* In case of no collision, this is the value of velocity */
 				velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
 				/* Check for a collision */
 				if( dist < minDistance )
 				{
 					/* Project back to surface along normal */
 					position = position + (float4)((minDistance - dist)*normalVector*0.9f, 0.f);
 					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
 					float3 relativeVelocity = velocity - velocityOfSurfacePoint;
 					float3 p1 = normalize(cross(normalVector, segment));
 					float3 p2 = normalize(cross(p1, normalVector));
 					/* Full friction is sum of velocities in each direction of plane */
 					float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
 					/* Real friction is peak friction corrected by friction coefficients */
 					frictionVector = frictionVector * (colliderFriction*clothFriction);
 					float approachSpeed = dot(relativeVelocity, normalVector);
 					if( approachSpeed <= 0.0f )
 						forceOnVertex -= frictionVector;
 				}
 			}
 		}
 	} else {
 		/* Update velocity	*/
 		float3 difference = position.xyz - previousPosition.xyz;
 		velocity = difference*velocityCoefficient*isolverdt;			
 	}
 	g_vertexVelocities[nodeID] = (float4)(velocity, 0.f);	
 	/* Update external force */
 	g_vertexForces[nodeID] = (float4)(forceOnVertex, 0.f);
 	g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositions.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositions.cl
@@ -1,55 +0,0 @@
 MSTRINGIFY(
 /*#define float3 float4
 float dot3(float3 a, float3 b)
 {
   return a.x*b.x + a.y*b.y + a.z*b.z;
 }*/
 __kernel void 
 SolvePositionsFromLinksKernel( 
 	const int startLink,
 	const int numLinks,
 	const float kst,
 	const float ti,
 	__global int2 * g_linksVertexIndices,
 	__global float * g_linksMassLSC,
 	__global float * g_linksRestLengthSquared,
 	__global float * g_verticesInverseMass,
 	__global float4 * g_vertexPositions)
 {
 	int linkID = get_global_id(0) + startLink;
 	if( get_global_id(0) < numLinks )
 	{	
 		float massLSC = g_linksMassLSC[linkID];
 		float restLengthSquared = g_linksRestLengthSquared[linkID];
 		if( massLSC > 0.0f )
 		{		
 			int2 nodeIndices = g_linksVertexIndices[linkID];
 			int node0 = nodeIndices.x;
 			int node1 = nodeIndices.y;
 			float3 position0 = g_vertexPositions[node0].xyz;
 			float3 position1 = g_vertexPositions[node1].xyz;
 			float inverseMass0 = g_verticesInverseMass[node0];
 			float inverseMass1 = g_verticesInverseMass[node1]; 
 			float3 del = position1 - position0;
 			float len  = dot(del, del);
 			float k    = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
 			position0 = position0 - del*(k*inverseMass0);
 			position1 = position1 + del*(k*inverseMass1);
 			g_vertexPositions[node0] = (float4)(position0, 0.f);
 			g_vertexPositions[node1] = (float4)(position1, 0.f);
 		}
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositionsSIMDBatched.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositionsSIMDBatched.cl
@@ -1,129 +0,0 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 MSTRINGIFY(
 float mydot3(float4 a, float4 b)
 {
   return a.x*b.x + a.y*b.y + a.z*b.z;
 }
 __kernel void 
 SolvePositionsFromLinksKernel( 
 	const int startWaveInBatch,
 	const int numWaves,
 	const float kst,
 	const float ti,
 	__global int2 *g_wavefrontBatchCountsVertexCounts,
 	__global int *g_vertexAddressesPerWavefront,
 	__global int2 * g_linksVertexIndices,
 	__global float * g_linksMassLSC,
 	__global float * g_linksRestLengthSquared,
 	__global float * g_verticesInverseMass,
 	__global float4 * g_vertexPositions,
 	__local int2 *wavefrontBatchCountsVertexCounts,
 	__local float4 *vertexPositionSharedData,
 	__local float *vertexInverseMassSharedData)
 {
 	const int laneInWavefront = (get_global_id(0) & (WAVEFRONT_SIZE-1));
 	const int wavefront = startWaveInBatch + (get_global_id(0) / WAVEFRONT_SIZE);
 	const int firstWavefrontInBlock = startWaveInBatch + get_group_id(0) * WAVEFRONT_BLOCK_MULTIPLIER;
 	const int localWavefront = wavefront - firstWavefrontInBlock;
 	// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier	
 	if( wavefront < (startWaveInBatch + numWaves) )
 	{	
 		// Load the batch counts for the wavefronts
 		int2 batchesAndVerticesWithinWavefront = g_wavefrontBatchCountsVertexCounts[wavefront];
 		int batchesWithinWavefront = batchesAndVerticesWithinWavefront.x;
 		int verticesUsedByWave = batchesAndVerticesWithinWavefront.y;
 		// Load the vertices for the wavefronts
 		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
 		{
 			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
 			vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_vertexPositions[vertexAddress];
 			vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress];
 		}
 		mem_fence(CLK_LOCAL_MEM_FENCE);
 		// Loop through the batches performing the solve on each in LDS
 		int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE;	
 		//for( int batch = 0; batch < batchesWithinWavefront; ++batch )
 		int batch = 0;
 		do
 		{
 			int baseDataLocation = baseDataLocationForWave + WAVEFRONT_SIZE * batch;
 			int locationOfValue = baseDataLocation + laneInWavefront;
 			// These loads should all be perfectly linear across the WF
 			int2 localVertexIndices = g_linksVertexIndices[locationOfValue];
 			float massLSC = g_linksMassLSC[locationOfValue];
 			float restLengthSquared = g_linksRestLengthSquared[locationOfValue];
 			// LDS vertex addresses based on logical wavefront number in block and loaded index
 			int vertexAddress0 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.x;
 			int vertexAddress1 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.y;
 			float4 position0 = vertexPositionSharedData[vertexAddress0];
 			float4 position1 = vertexPositionSharedData[vertexAddress1];
 			float inverseMass0 = vertexInverseMassSharedData[vertexAddress0];
 			float inverseMass1 = vertexInverseMassSharedData[vertexAddress1]; 
 			float4 del = position1 - position0;
 			float len = mydot3(del, del);
 			float k = 0;
 			if( massLSC > 0.0f )
 			{		
 				k = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
 			}
 			position0 = position0 - del*(k*inverseMass0);
 			position1 = position1 + del*(k*inverseMass1);
 			// Ensure compiler does not re-order memory operations
 			mem_fence(CLK_LOCAL_MEM_FENCE);
 			vertexPositionSharedData[vertexAddress0] = position0;
 			vertexPositionSharedData[vertexAddress1] = position1;
 			// Ensure compiler does not re-order memory operations
 			mem_fence(CLK_LOCAL_MEM_FENCE);
 			++batch;
 		} while( batch < batchesWithinWavefront );
 		// Update the global memory vertices for the wavefronts
 		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
 		{
 			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
 			g_vertexPositions[vertexAddress] = (float4)(vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex].xyz, 0.f);
 		}		
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdateConstants.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdateConstants.cl
@@ -1,44 +0,0 @@
 MSTRINGIFY(
 /*#define float3 float4
 float dot3(float3 a, float3 b)
 {
   return a.x*b.x + a.y*b.y + a.z*b.z;
 }*/
 __kernel void 
 UpdateConstantsKernel( 
 	const int numLinks,
 	__global int2 * g_linksVertexIndices,
 	__global float4 * g_vertexPositions,
 	__global float * g_vertexInverseMasses,
 	__global float * g_linksMaterialLSC,
 	__global float * g_linksMassLSC,
 	__global float * g_linksRestLengthSquared,
 	__global float * g_linksRestLengths)
 {
 	int linkID = get_global_id(0);
 	if( linkID < numLinks )
 	{	
 		int2 nodeIndices = g_linksVertexIndices[linkID];
 		int node0 = nodeIndices.x;
 		int node1 = nodeIndices.y;
 		float linearStiffnessCoefficient = g_linksMaterialLSC[ linkID ];
 		float3 position0   = g_vertexPositions[node0].xyz;
 		float3 position1   = g_vertexPositions[node1].xyz;
 		float inverseMass0 = g_vertexInverseMasses[node0];
 		float inverseMass1 = g_vertexInverseMasses[node1];
 		float3 difference = position0 - position1;
 		float length2 = dot(difference, difference);
 		float length = sqrt(length2);
 		g_linksRestLengths[linkID] = length;
 		g_linksMassLSC[linkID] = (inverseMass0 + inverseMass1)/linearStiffnessCoefficient;
 		g_linksRestLengthSquared[linkID] = length*length;		
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdateNodes.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdateNodes.cl
@@ -1,40 +0,0 @@
 MSTRINGIFY(
 //#define float3 float4
 __kernel void 
 updateVelocitiesFromPositionsWithVelocitiesKernel( 
 	int numNodes,
 	float isolverdt,
 	__global float4 * g_vertexPositions,
 	__global float4 * g_vertexPreviousPositions,
 	__global int * g_vertexClothIndices,
 	__global float *g_clothVelocityCorrectionCoefficients,
 	__global float * g_clothDampingFactor,
 	__global float4 * g_vertexVelocities,
 	__global float4 * g_vertexForces)
 {
 	int nodeID = get_global_id(0);
 	if( nodeID < numNodes )
 	{	
 		float3 position = g_vertexPositions[nodeID].xyz;
 		float3 previousPosition = g_vertexPreviousPositions[nodeID].xyz;
 		float3 velocity = g_vertexVelocities[nodeID].xyz;
 		int clothIndex = g_vertexClothIndices[nodeID];
 		float velocityCorrectionCoefficient = g_clothVelocityCorrectionCoefficients[clothIndex];
 		float dampingFactor = g_clothDampingFactor[clothIndex];
 		float velocityCoefficient = (1.f - dampingFactor);
 		float3 difference = position - previousPosition;
 		velocity += difference*velocityCorrectionCoefficient*isolverdt;
 		// Damp the velocity
 		velocity *= velocityCoefficient;
 		g_vertexVelocities[nodeID] = (float4)(velocity, 0.f);
 		g_vertexForces[nodeID] = (float4)(0.f, 0.f, 0.f, 0.f);								
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdateNormals.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdateNormals.cl
@@ -1,103 +0,0 @@
 MSTRINGIFY(
 //#define float3 float4
 /*float length3(float3 a)
 {
 	a.w = 0;
 	return length(a);
 }
 float normalize3(float3 a)
 {
 	a.w = 0;
 	return normalize(a);
 }*/
 __kernel void 
 ResetNormalsAndAreasKernel(
 	const unsigned int numNodes,
 	__global float4 * g_vertexNormals,
 	__global float * g_vertexArea)
 {
 	if( get_global_id(0) < numNodes )
 	{
 		g_vertexNormals[get_global_id(0)] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
 		g_vertexArea[get_global_id(0)]    = 0.0f;
 	}
 }
 __kernel void 
 UpdateSoftBodiesKernel(
 	const unsigned int startFace,
 	const unsigned int numFaces,
 	__global int4 * g_triangleVertexIndexSet,
 	__global float4 * g_vertexPositions,
 	__global float4 * g_vertexNormals,
 	__global float * g_vertexArea,
 	__global float4 * g_triangleNormals,
 	__global float * g_triangleArea)
 {
 	int faceID = get_global_id(0) + startFace;
 	if( get_global_id(0) < numFaces )
 	{		
 		int4 triangleIndexSet = g_triangleVertexIndexSet[ faceID ];
 		int nodeIndex0 = triangleIndexSet.x;
 		int nodeIndex1 = triangleIndexSet.y;
 		int nodeIndex2 = triangleIndexSet.z;
 		float3 node0 = g_vertexPositions[nodeIndex0].xyz;
 		float3 node1 = g_vertexPositions[nodeIndex1].xyz;
 		float3 node2 = g_vertexPositions[nodeIndex2].xyz;
 		float3 nodeNormal0 = g_vertexNormals[nodeIndex0].xyz;
 		float3 nodeNormal1 = g_vertexNormals[nodeIndex1].xyz;
 		float3 nodeNormal2 = g_vertexNormals[nodeIndex2].xyz;
 		float vertexArea0 = g_vertexArea[nodeIndex0];
 		float vertexArea1 = g_vertexArea[nodeIndex1];
 		float vertexArea2 = g_vertexArea[nodeIndex2];
 		float3 vector0 = node1 - node0;
 		float3 vector1 = node2 - node0;
 		float3 faceNormal = cross(vector0.xyz, vector1.xyz);
 		float triangleArea = length(faceNormal);
 		nodeNormal0 = nodeNormal0 + faceNormal;
 		nodeNormal1 = nodeNormal1 + faceNormal;
 		nodeNormal2 = nodeNormal2 + faceNormal;
 		vertexArea0 = vertexArea0 + triangleArea;
 		vertexArea1 = vertexArea1 + triangleArea;
 		vertexArea2 = vertexArea2 + triangleArea;
 		g_triangleNormals[faceID] = (float4)(normalize(faceNormal), 0.f);
 		g_vertexNormals[nodeIndex0] = (float4)(nodeNormal0, 0.f);
 		g_vertexNormals[nodeIndex1] = (float4)(nodeNormal1, 0.f);
 		g_vertexNormals[nodeIndex2] = (float4)(nodeNormal2, 0.f);
 		g_triangleArea[faceID] = triangleArea;
 		g_vertexArea[nodeIndex0] = vertexArea0;
 		g_vertexArea[nodeIndex1] = vertexArea1;
 		g_vertexArea[nodeIndex2] = vertexArea2;
 	}
 }
 __kernel void 
 NormalizeNormalsAndAreasKernel( 
 	const unsigned int numNodes,
 	__global int * g_vertexTriangleCount,
 	__global float4 * g_vertexNormals,
 	__global float * g_vertexArea)
 {
 	if( get_global_id(0) < numNodes )
 	{
 		float4 normal = g_vertexNormals[get_global_id(0)];
 		float area = g_vertexArea[get_global_id(0)];
 		int numTriangles = g_vertexTriangleCount[get_global_id(0)];
 		float vectorLength = length(normal);
 		g_vertexNormals[get_global_id(0)] = normalize(normal);
 		g_vertexArea[get_global_id(0)] = area/(float)(numTriangles);
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdatePositions.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdatePositions.cl
@@ -1,36 +0,0 @@
 MSTRINGIFY(
 //#define float3 float4
 __kernel void 
 updateVelocitiesFromPositionsWithoutVelocitiesKernel( 
 	const int numNodes,
 	const float isolverdt,
 	__global float4 * g_vertexPositions,
 	__global float4 * g_vertexPreviousPositions,
 	__global int * g_vertexClothIndices,
 	__global float * g_clothDampingFactor,
 	__global float4 * g_vertexVelocities,
 	__global float4 * g_vertexForces)
 {
 	int nodeID = get_global_id(0);
 	if( nodeID < numNodes )
 	{	
 		float3 position = g_vertexPositions[nodeID].xyz;
 		float3 previousPosition = g_vertexPreviousPositions[nodeID].xyz;
 		float3 velocity = g_vertexVelocities[nodeID].xyz;
 		int clothIndex = g_vertexClothIndices[nodeID];
 		float dampingFactor = g_clothDampingFactor[clothIndex];
 		float velocityCoefficient = (1.f - dampingFactor);
 		float3 difference = position - previousPosition;
 		velocity = difference*velocityCoefficient*isolverdt;		
 		g_vertexVelocities[nodeID] = (float4)(velocity, 0.f);
 		g_vertexForces[nodeID] = (float4)(0.f, 0.f, 0.f, 0.f);								
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdatePositionsFromVelocities.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdatePositionsFromVelocities.cl
@@ -1,26 +0,0 @@
 MSTRINGIFY(
 //#define float3 float4
 __kernel void 
 UpdatePositionsFromVelocitiesKernel( 
 	const int numNodes,
 	const float solverSDT,
 	__global float4 * g_vertexVelocities,
 	__global float4 * g_vertexPreviousPositions,
 	__global float4 * g_vertexCurrentPosition)
 {
 	int vertexID = get_global_id(0);
 	if( vertexID < numNodes )
 	{	
 		float3 previousPosition = g_vertexPreviousPositions[vertexID].xyz;
 		float3 velocity         = g_vertexVelocities[vertexID].xyz;
 		float3 newPosition      = previousPosition + velocity*solverSDT;
 		g_vertexCurrentPosition[vertexID]   = (float4)(newPosition, 0.f);
 		g_vertexPreviousPositions[vertexID] = (float4)(newPosition, 0.f);
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/VSolveLinks.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/VSolveLinks.cl
@@ -1,45 +0,0 @@
 MSTRINGIFY(
 __kernel void 
 VSolveLinksKernel( 
 	int startLink,
 	int numLinks,
 	float kst,
 	__global int2 * g_linksVertexIndices,
 	__global float * g_linksLengthRatio,
 	__global float4 * g_linksCurrentLength,
 	__global float * g_vertexInverseMass,
 	__global float4 * g_vertexVelocity)
 {
 	int linkID = get_global_id(0) + startLink;
 	if( get_global_id(0) < numLinks )
 	{		
 		int2 nodeIndices = g_linksVertexIndices[linkID];
 		int node0 = nodeIndices.x;
 		int node1 = nodeIndices.y;
 		float linkLengthRatio = g_linksLengthRatio[linkID];
 		float3 linkCurrentLength = g_linksCurrentLength[linkID].xyz;
 		float3 vertexVelocity0 = g_vertexVelocity[node0].xyz;
 		float3 vertexVelocity1 = g_vertexVelocity[node1].xyz;
 		float vertexInverseMass0 = g_vertexInverseMass[node0];
 		float vertexInverseMass1 = g_vertexInverseMass[node1]; 
 		float3 nodeDifference = vertexVelocity0 - vertexVelocity1;
 		float dotResult = dot(linkCurrentLength, nodeDifference);
 		float j = -dotResult*linkLengthRatio*kst;
 		float3 velocityChange0 = linkCurrentLength*(j*vertexInverseMass0);
 		float3 velocityChange1 = linkCurrentLength*(j*vertexInverseMass1);
 		vertexVelocity0 += velocityChange0;
 		vertexVelocity1 -= velocityChange1;
 		g_vertexVelocity[node0] = (float4)(vertexVelocity0, 0.f);
 		g_vertexVelocity[node1] = (float4)(vertexVelocity1, 0.f);
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ApplyForces.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ApplyForces.cl
@@ -6,6 +6,18 @@ float adot3(float4 a, float4 b)
   return a.x*b.x + a.y*b.y + a.z*b.z;
 }
 float alength3(float4 a)
 {
 	a.w = 0;
 	return length(a);
 }
 float4 anormalize3(float4 a)
 {
 	a.w = 0;
 	return normalize(a);
 }
 float4 projectOnAxis( float4 v, float4 a )
 {
 	return (a*adot3(v, a));
@@ -53,36 +65,35 @@ ApplyForcesKernel(
 			g_vertexVelocity[nodeID] = nodeV;
-			float4 relativeWindVelocity = nodeV - clothWindVelocity;
+			// Aerodynamics
-			float relativeSpeedSquared = dot(relativeWindVelocity, relativeWindVelocity);
+			float4 rel_v = nodeV - clothWindVelocity;
 			float rel_v_len = alength3(rel_v);
 			float rel_v2 = dot(rel_v, rel_v);
-			if( relativeSpeedSquared > epsilon )
+			if( rel_v2 > epsilon )
 			{
-				// Correct direction of normal relative to wind direction and get dot product
+				float4 rel_v_nrm = anormalize3(rel_v);
-				normal = normal * (dot(normal, relativeWindVelocity) < 0 ? -1.f : 1.f);
+				float4 nrm = normal;
 				float dvNormal = dot(normal, relativeWindVelocity);
 				if( dvNormal > 0 )
 				{
 					float4 force = (float4)(0.f, 0.f, 0.f, 0.f);
 					float c0 = area * dvNormal * relativeSpeedSquared / 2.f;
 					float c1 = c0 * mediumDensity;
 					force += normal * (-c1 * liftFactor);
 					force += normalize(relativeWindVelocity)*(-c1 * dragFactor);
-					float dtim = solverdt * nodeIM;
+				nrm = nrm * (dot(nrm, rel_v) < 0 ? -1.f : 1.f);
 					float4 forceDTIM = force * dtim;
-					float4 nodeFPlusForce = nodeF + force;
+				float4 fDrag = (float4)(0.f, 0.f, 0.f, 0.f);
 				float4 fLift = (float4)(0.f, 0.f, 0.f, 0.f);
-					// m_nodesf[i] -= ProjectOnAxis(m_nodesv[i], force.normalized())/dtim;	
+				float n_dot_v = dot(nrm, rel_v_nrm);
 					float4 nodeFMinus = nodeF - (projectOnAxis(nodeV, normalize(force))/dtim);
-					nodeF = nodeFPlusForce;
+				// drag force
-					//if( dot(forceDTIM, forceDTIM) > dot(nodeV, nodeV) )
+				if ( dragFactor > 0.f )
-					//	nodeF = nodeFMinus;
+					fDrag = 0.5f * dragFactor * mediumDensity * rel_v2 * area * n_dot_v * (-1.0f) * rel_v_nrm;
 				// lift force
 				// Check angle of attack
 				// cos(10<EFBFBD>) = 0.98480
 				if ( 0 < n_dot_v && n_dot_v < 0.98480f)
 					fLift = 0.5f * liftFactor * mediumDensity * rel_v_len * area * sqrt(1.0f-n_dot_v*n_dot_v) * (cross(cross(nrm, rel_v_nrm), rel_v_nrm));
 				nodeF += fDrag + fLift;
 					g_vertexForceAccumulator[nodeID] = nodeF;	
 				}
 			}
 		}
 	}
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateFixedVertexPositions.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateFixedVertexPositions.cl
@@ -0,0 +1,25 @@
 MSTRINGIFY(
 __kernel void 
 UpdateFixedVertexPositions(
 	const uint numNodes,
 	__global int * g_anchorIndex,
 	__global float4 * g_vertexPositions,
 	__global float4 * g_anchorPositions GUID_ARG)
 {
 	unsigned int nodeID = get_global_id(0);
 	if( nodeID < numNodes )
 	{		
 		int anchorIndex  = g_anchorIndex[nodeID];
 		float4 position = g_vertexPositions[nodeID];
 		if ( anchorIndex >= 0 )
 		{
 			float4 anchorPosition = g_anchorPositions[anchorIndex];
 			g_vertexPositions[nodeID] = anchorPosition;
 		}
 	}		
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp
@@ -5,17 +5,10 @@
 #include "btSoftBodySolverVertexBuffer_OpenGL.h"
 #include "BulletSoftBody/btSoftBody.h"
 #if (0)//CL_VERSION_1_1 == 1)
 //OpenCL 1.1 kernels use float3
 #define MSTRINGIFY(A) #A
 static char* OutputToVertexArrayCLString =
 #include "OpenCLC/OutputToVertexArray.cl"
 #else
 ////OpenCL 1.0 kernels don't use float3
 #define MSTRINGIFY(A) #A
 static char* OutputToVertexArrayCLString =
 #include "OpenCLC10/OutputToVertexArray.cl"
 #endif //CL_VERSION_1_1
 #define RELEASE_CL_KERNEL(kernelName) {if( kernelName ){ clReleaseKernel( kernelName ); kernelName = 0; }}
@@ -105,8 +98,8 @@ bool btSoftBodySolverOutputCLtoGL::buildShaders()
 	if( m_shadersInitialized )
 		return true;
-	outputToVertexArrayWithNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithNormalsKernel" );
+	outputToVertexArrayWithNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithNormalsKernel" ,"","OpenCLC10/OutputToVertexArray.cl");
-	outputToVertexArrayWithoutNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithoutNormalsKernel" );
+	outputToVertexArrayWithoutNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithoutNormalsKernel" ,"","OpenCLC10/OutputToVertexArray.cl");
 	if( returnVal )
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
@@ -45,7 +45,7 @@ public:
 	virtual bool moveToAccelerator();
-	virtual bool moveFromAccelerator();
+	virtual bool moveFromAccelerator(bool bCopy = false, bool bCopyMinimum = true);
 };
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
@@ -20,10 +20,13 @@ subject to the following restrictions:
 #include "btSoftBodySolver_OpenCL.h"
 #include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
 #include "BulletSoftBody/btSoftBody.h"
 #include "BulletSoftBody/btSoftBodyInternals.h"
 #include "BulletCollision/CollisionShapes/btCapsuleShape.h"
 #include "BulletCollision/COllisionShapes/btSphereShape.h"
 #include "LinearMath/btQuickprof.h"
 #include <limits.h>
 #define BT_SUPPRESS_OPENCL_ASSERTS
 #ifdef USE_MINICL
@@ -36,8 +39,7 @@ subject to the following restrictions:
 	#endif //__APPLE__
 #endif//USE_MINICL
-#define BT_DEFAULT_WORKGROUPSIZE 128
+#define BT_DEFAULT_WORKGROUPSIZE 64
 #define RELEASE_CL_KERNEL(kernelName) {if( kernelName ){ clReleaseKernel( kernelName ); kernelName = 0; }}
@@ -45,32 +47,6 @@ subject to the following restrictions:
 //CL_VERSION_1_1 seems broken on NVidia SDK so just disable it
 #if (0)//CL_VERSION_1_1 == 1)
 //OpenCL 1.1 kernels use float3
 #define MSTRINGIFY(A) #A
 static const char* PrepareLinksCLString = 
 #include "OpenCLC/PrepareLinks.cl"
 static const char* UpdatePositionsFromVelocitiesCLString = 
 #include "OpenCLC/UpdatePositionsFromVelocities.cl"
 static const char* SolvePositionsCLString = 
 #include "OpenCLC/SolvePositions.cl"
 static const char* UpdateNodesCLString = 
 #include "OpenCLC/UpdateNodes.cl"
 static const char* UpdatePositionsCLString = 
 #include "OpenCLC/UpdatePositions.cl"
 static const char* UpdateConstantsCLString = 
 #include "OpenCLC/UpdateConstants.cl"
 static const char* IntegrateCLString = 
 #include "OpenCLC/Integrate.cl"
 static const char* ApplyForcesCLString = 
 #include "OpenCLC/ApplyForces.cl"
 static const char* UpdateNormalsCLString = 
 #include "OpenCLC/UpdateNormals.cl"
 static const char* VSolveLinksCLString = 
 #include "OpenCLC/VSolveLinks.cl"
 static const char* SolveCollisionsAndUpdateVelocitiesCLString =
 #include "OpenCLC/SolveCollisionsAndUpdateVelocities.cl"
 #else
 ////OpenCL 1.0 kernels don't use float3
 #define MSTRINGIFY(A) #A
 static const char* PrepareLinksCLString = 
@@ -89,13 +65,14 @@ static const char* IntegrateCLString =
 #include "OpenCLC10/Integrate.cl"
 static const char* ApplyForcesCLString = 
 #include "OpenCLC10/ApplyForces.cl"
 static const char* UpdateFixedVertexPositionsCLString = 
 #include "OpenCLC10/UpdateFixedVertexPositions.cl"
 static const char* UpdateNormalsCLString = 
 #include "OpenCLC10/UpdateNormals.cl"
 static const char* VSolveLinksCLString = 
 #include "OpenCLC10/VSolveLinks.cl"
 static const char* SolveCollisionsAndUpdateVelocitiesCLString =
 #include "OpenCLC10/SolveCollisionsAndUpdateVelocities.cl"
 #endif //CL_VERSION_1_1
 btSoftBodyVertexDataOpenCL::btSoftBodyVertexDataOpenCL( cl_command_queue queue, cl_context ctx) :
@@ -141,18 +118,42 @@ bool btSoftBodyVertexDataOpenCL::moveToAccelerator()
 	return success;
 }
-bool btSoftBodyVertexDataOpenCL::moveFromAccelerator()
+bool btSoftBodyVertexDataOpenCL::moveFromAccelerator(bool bCopy, bool bCopyMinimum)
 {
 	bool success = true;
-	success = success && m_clClothIdentifier.moveFromGPU();
+
-	success = success && m_clVertexPosition.moveFromGPU();
+	if (!bCopy)
-	success = success && m_clVertexPreviousPosition.moveFromGPU();
+	{
-	success = success && m_clVertexVelocity.moveFromGPU();
+		success = success && m_clClothIdentifier.moveFromGPU();
-	success = success && m_clVertexForceAccumulator.moveFromGPU();
+		success = success && m_clVertexPosition.moveFromGPU();
-	success = success && m_clVertexNormal.moveFromGPU();
+		success = success && m_clVertexPreviousPosition.moveFromGPU();
-	success = success && m_clVertexInverseMass.moveFromGPU();
+		success = success && m_clVertexVelocity.moveFromGPU();
-	success = success && m_clVertexArea.moveFromGPU();
+		success = success && m_clVertexForceAccumulator.moveFromGPU();
-	success = success && m_clVertexTriangleCount.moveFromGPU();
+		success = success && m_clVertexNormal.moveFromGPU();
 		success = success && m_clVertexInverseMass.moveFromGPU();
 		success = success && m_clVertexArea.moveFromGPU();
 		success = success && m_clVertexTriangleCount.moveFromGPU();
 	}
 	else
 	{
 		if (bCopyMinimum)
 		{
 			success = success && m_clVertexPosition.copyFromGPU();
 			success = success && m_clVertexNormal.copyFromGPU();
 		}
 		else
 		{
 			success = success && m_clClothIdentifier.copyFromGPU();
 			success = success && m_clVertexPosition.copyFromGPU();
 			success = success && m_clVertexPreviousPosition.copyFromGPU();
 			success = success && m_clVertexVelocity.copyFromGPU();
 			success = success && m_clVertexForceAccumulator.copyFromGPU();
 			success = success && m_clVertexNormal.copyFromGPU();
 			success = success && m_clVertexInverseMass.copyFromGPU();
 			success = success && m_clVertexArea.copyFromGPU();
 			success = success && m_clVertexTriangleCount.copyFromGPU();
 		}
 	}
 	if( success )
 		m_onGPU = true;
@@ -160,9 +161,6 @@ bool btSoftBodyVertexDataOpenCL::moveFromAccelerator()
 	return success;
 }
 btSoftBodyLinkDataOpenCL::btSoftBodyLinkDataOpenCL(cl_command_queue queue,  cl_context ctx) 
 :m_cqCommandQue(queue),
 	m_clLinks( queue, ctx, &m_links, false ),
@@ -602,11 +600,12 @@ void btSoftBodyTriangleDataOpenCL::generateBatches()
-btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_context ctx) :
+btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_context ctx, bool bUpdateAchchoredNodePos) :
 	m_linkData(queue, ctx),
 	m_vertexData(queue, ctx),
 	m_triangleData(queue, ctx),
-	m_clFunctions(queue, ctx),
+	m_defaultCLFunctions(queue, ctx),
 	m_currentCLFunctions(&m_defaultCLFunctions),
 	m_clPerClothAcceleration(queue, ctx, &m_perClothAcceleration, true ),
 	m_clPerClothWindVelocity(queue, ctx, &m_perClothWindVelocity, true ),
 	m_clPerClothDampingFactor(queue,ctx, &m_perClothDampingFactor, true ),
@@ -617,10 +616,14 @@ btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_contex
 	m_clPerClothCollisionObjects( queue, ctx, &m_perClothCollisionObjects, true ),
 	m_clCollisionObjectDetails( queue, ctx, &m_collisionObjectDetails, true ),
 	m_clPerClothFriction( queue, ctx, &m_perClothFriction, false ),
 	m_clAnchorPosition( queue, ctx, &m_anchorPosition, true ),
 	m_clAnchorIndex( queue, ctx, &m_anchorIndex, true),
 	m_cqCommandQue( queue ),
 	m_cxMainContext(ctx),
-	m_defaultWorkGroupSize(BT_DEFAULT_WORKGROUPSIZE)
+	m_defaultWorkGroupSize(BT_DEFAULT_WORKGROUPSIZE),
 	m_bUpdateAnchoredNodePos(bUpdateAchchoredNodePos)
 {
 	// Initial we will clearly need to update solver constants
 	// For now this is global for the cloths linked with this solver - we should probably make this body specific 
 	// for performance in future once we understand more clearly when constants need to be updated
@@ -643,6 +646,7 @@ btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_contex
 	m_normalizeNormalsAndAreasKernel = 0;
 	m_outputToVertexArrayKernel = 0;
 	m_applyForcesKernel = 0;
 	m_updateFixedVertexPositionsKernel = 0;
 }
 btOpenCLSoftBodySolver::~btOpenCLSoftBodySolver()
@@ -666,14 +670,16 @@ void btOpenCLSoftBodySolver::releaseKernels()
 	RELEASE_CL_KERNEL( m_normalizeNormalsAndAreasKernel );
 	RELEASE_CL_KERNEL( m_outputToVertexArrayKernel );
 	RELEASE_CL_KERNEL( m_applyForcesKernel );
 	RELEASE_CL_KERNEL( m_updateFixedVertexPositionsKernel );
 	m_shadersInitialized = false;
 }
-void btOpenCLSoftBodySolver::copyBackToSoftBodies()
+void btOpenCLSoftBodySolver::copyBackToSoftBodies(bool bMove)
 {
 	// Move the vertex data back to the host first
-	m_vertexData.moveFromAccelerator();
+	m_vertexData.moveFromAccelerator(!bMove);
 	// Loop over soft bodies, copying all the vertex positions back for each body in turn
 	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
@@ -688,15 +694,16 @@ void btOpenCLSoftBodySolver::copyBackToSoftBodies()
 		for( int vertex = 0; vertex < numVertices; ++vertex )
 		{
 			using Vectormath::Aos::Point3;
-			Point3 vertexPosition( getVertexData().getVertexPositions()[firstVertex + vertex] );
+			Point3 vertexPosition( m_vertexData.getVertexPositions()[firstVertex + vertex] );
 			Point3 normal(m_vertexData.getNormal(firstVertex + vertex));
 			softBody->m_nodes[vertex].m_x.setX( vertexPosition.getX() );
 			softBody->m_nodes[vertex].m_x.setY( vertexPosition.getY() );
 			softBody->m_nodes[vertex].m_x.setZ( vertexPosition.getZ() );
-			softBody->m_nodes[vertex].m_n.setX( vertexPosition.getX() );
+			softBody->m_nodes[vertex].m_n.setX( normal.getX() );
-			softBody->m_nodes[vertex].m_n.setY( vertexPosition.getY() );
+			softBody->m_nodes[vertex].m_n.setY( normal.getY() );
-			softBody->m_nodes[vertex].m_n.setZ( vertexPosition.getZ() );
+			softBody->m_nodes[vertex].m_n.setZ( normal.getZ() );
 		}
 	}	
 } // btOpenCLSoftBodySolver::copyBackToSoftBodies
@@ -710,7 +717,10 @@ void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &sof
 		getTriangleData().clear();
 		getLinkData().clear();
 		m_softBodySet.resize(0);
 		m_anchorIndex.clear();
 		int maxPiterations = 0;
 		int maxViterations = 0;
 		for( int softBodyIndex = 0; softBodyIndex < softBodies.size(); ++softBodyIndex )
 		{
@@ -759,6 +769,8 @@ void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &sof
 				float vertexInverseMass = softBody->m_nodes[vertex].m_im;
 				desc.setInverseMass(vertexInverseMass);
 				getVertexData().setVertexAt( desc, firstVertex + vertex );
 				m_anchorIndex.push_back(-1.0);
 			}
 			// Copy triangles similarly
@@ -805,13 +817,76 @@ void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &sof
 			newSoftBody->setMaxTriangles( maxTriangles );
 			newSoftBody->setFirstLink( firstLink );
 			newSoftBody->setNumLinks( numLinks );
 			// Find maximum piterations and viterations
 			int piterations = softBody->m_cfg.piterations;
            if ( piterations > maxPiterations )
                  maxPiterations = piterations;
            int viterations = softBody->m_cfg.viterations;
 			if ( viterations > maxViterations )
                  maxViterations = viterations;
 			// zero mass
 			for( int vertex = 0; vertex < numVertices; ++vertex )
 			{
 				if ( softBody->m_nodes[vertex].m_im == 0 )
 				{
 					AnchorNodeInfoCL nodeInfo;
 					nodeInfo.clVertexIndex = firstVertex + vertex;
 					nodeInfo.pNode = &softBody->m_nodes[vertex];
 					m_anchorNodeInfoArray.push_back(nodeInfo);
 				}
 			}			
 			// anchor position
 			if ( numVertices > 0 )
 			{
 				for ( int anchorIndex = 0; anchorIndex < softBody->m_anchors.size(); anchorIndex++ )
 				{
 					btSoftBody::Node* anchorNode = softBody->m_anchors[anchorIndex].m_node;
 					btSoftBody::Node* firstNode = &softBody->m_nodes[0];
 					AnchorNodeInfoCL nodeInfo;
 					nodeInfo.clVertexIndex = firstVertex + (int)(anchorNode - firstNode);
 					nodeInfo.pNode = anchorNode;
 					m_anchorNodeInfoArray.push_back(nodeInfo);
 				}
 			}			
 		}
 		m_anchorPosition.clear();		
 		m_anchorPosition.resize(m_anchorNodeInfoArray.size());
 		for ( int anchorNode = 0; anchorNode < m_anchorNodeInfoArray.size(); anchorNode++ )
 		{
 			const AnchorNodeInfoCL& anchorNodeInfo = m_anchorNodeInfoArray[anchorNode];
 			m_anchorIndex[anchorNodeInfo.clVertexIndex] = anchorNode;
 			getVertexData().getInverseMass(anchorNodeInfo.clVertexIndex) = 0.0f;
 		}
 		updateConstants(0.f);
 		// set position and velocity iterations
 		setNumberOfPositionIterations(maxPiterations);
 		setNumberOfVelocityIterations(maxViterations);
 		// set wind velocity
 		m_perClothWindVelocity.resize( m_softBodySet.size() );
 		for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
 		{
 			btSoftBody *softBody = m_softBodySet[softBodyIndex]->getSoftBody();			
 			m_perClothWindVelocity[softBodyIndex] = toVector3(softBody->getWindVelocity());
 		}
 		m_clPerClothWindVelocity.changedOnCPU();
 		// generate batches
 		m_linkData.generateBatches();		
 		m_triangleData.generateBatches();
@@ -861,7 +936,6 @@ void btOpenCLSoftBodySolver::resetNormalsAndAreas( int numVertices )
 void btOpenCLSoftBodySolver::normalizeNormalsAndAreas( int numVertices )
 {
 	cl_int ciErrNum;
 	ciErrNum = clSetKernelArg(m_normalizeNormalsAndAreasKernel, 0, sizeof(int),(void*) &numVertices);
@@ -882,7 +956,6 @@ void btOpenCLSoftBodySolver::normalizeNormalsAndAreas( int numVertices )
 void btOpenCLSoftBodySolver::executeUpdateSoftBodies( int firstTriangle, int numTriangles )
 {
 	cl_int ciErrNum;
 	ciErrNum = clSetKernelArg(m_updateSoftBodiesKernel, 0, sizeof(int), (void*) &firstTriangle);
 	ciErrNum = clSetKernelArg(m_updateSoftBodiesKernel, 1, sizeof(int), &numTriangles);
@@ -948,9 +1021,34 @@ void btOpenCLSoftBodySolver::ApplyClampedForce( float solverdt, const Vectormath
 	}
 }
 void btOpenCLSoftBodySolver::updateFixedVertexPositions()
 {	
 	// Ensure data is on accelerator
 	m_vertexData.moveToAccelerator();
 	m_clAnchorPosition.moveToGPU();
 	m_clAnchorIndex.moveToGPU();
 	cl_int ciErrNum ;
 	int numVerts = m_vertexData.getNumVertices();
 	ciErrNum = clSetKernelArg(m_updateFixedVertexPositionsKernel, 0, sizeof(int), &numVerts);
 	ciErrNum = clSetKernelArg(m_updateFixedVertexPositionsKernel,1, sizeof(cl_mem), &m_clAnchorIndex.m_buffer);
 	ciErrNum = clSetKernelArg(m_updateFixedVertexPositionsKernel,2, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
 	ciErrNum = clSetKernelArg(m_updateFixedVertexPositionsKernel,3, sizeof(cl_mem), &m_clAnchorPosition.m_buffer);
 	size_t numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
 	if (numWorkItems)
 	{
 		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_updateFixedVertexPositionsKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize, 0,0,0);
 		if( ciErrNum != CL_SUCCESS ) 
 		{
 			btAssert( 0 &&  "enqueueNDRangeKernel(m_updateFixedVertexPositionsKernel)");
 		}
 	}
 }
 void btOpenCLSoftBodySolver::applyForces( float solverdt )
 {	
 	// Ensure data is on accelerator
 	m_vertexData.moveToAccelerator();
 	m_clPerClothAcceleration.moveToGPU();
@@ -976,6 +1074,7 @@ void btOpenCLSoftBodySolver::applyForces( float solverdt )
 	ciErrNum = clSetKernelArg(m_applyForcesKernel,11, sizeof(cl_mem), &m_clPerClothMediumDensity.m_buffer);
 	ciErrNum = clSetKernelArg(m_applyForcesKernel,12, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
 	ciErrNum = clSetKernelArg(m_applyForcesKernel,13, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
 	size_t numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
 	if (numWorkItems)
 	{
@@ -993,8 +1092,6 @@ void btOpenCLSoftBodySolver::applyForces( float solverdt )
 */
 void btOpenCLSoftBodySolver::integrate( float solverdt )
 {
 	// Ensure data is on accelerator
 	m_vertexData.moveToAccelerator();
@@ -1186,7 +1283,7 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 		updateVelocitiesFromPositionsWithoutVelocities( 1.f/solverdt );
 	}
-	// Solve drift
+	// Solve position
 	for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
 	{
 		for( int i = 0; i < m_linkData.m_batchStartLengths.size(); ++i )
@@ -1210,7 +1307,6 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 // Kernel dispatches
 void btOpenCLSoftBodySolver::prepareLinks()
 {
 	cl_int ciErrNum;
 	int numLinks = m_linkData.getNumLinks();
 	ciErrNum = clSetKernelArg(m_prepareLinksKernel,0, sizeof(int), &numLinks);
@@ -1231,7 +1327,6 @@ void btOpenCLSoftBodySolver::prepareLinks()
 void btOpenCLSoftBodySolver::updatePositionsFromVelocities( float solverdt )
 {
 	cl_int ciErrNum;
 	int numVerts = m_vertexData.getNumVertices();
 	ciErrNum = clSetKernelArg(m_updatePositionsFromVelocitiesKernel,0, sizeof(int), &numVerts);
@@ -1251,7 +1346,6 @@ void btOpenCLSoftBodySolver::updatePositionsFromVelocities( float solverdt )
 void btOpenCLSoftBodySolver::solveLinksForPosition( int startLink, int numLinks, float kst, float ti )
 {
 	cl_int ciErrNum;
 	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,0, sizeof(int), &startLink);
 	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,1, sizeof(int), &numLinks);
@@ -1275,7 +1369,6 @@ void btOpenCLSoftBodySolver::solveLinksForPosition( int startLink, int numLinks,
 void btOpenCLSoftBodySolver::solveLinksForVelocity( int startLink, int numLinks, float kst )
 {
 	cl_int ciErrNum;
 	ciErrNum = clSetKernelArg(m_vSolveLinksKernel, 0, sizeof(int), &startLink);
 	ciErrNum = clSetKernelArg(m_vSolveLinksKernel, 1, sizeof(int), &numLinks);
@@ -1297,7 +1390,6 @@ void btOpenCLSoftBodySolver::solveLinksForVelocity( int startLink, int numLinks,
 void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithVelocities( float isolverdt )
 {
 	cl_int ciErrNum;
 	int numVerts = m_vertexData.getNumVertices();
 	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithVelocitiesKernel,0, sizeof(int), &numVerts);
@@ -1322,7 +1414,6 @@ void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithVelocities( float
 void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( float isolverdt )
 {
 	cl_int ciErrNum;
 	int numVerts = m_vertexData.getNumVertices();
 	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithoutVelocitiesKernel, 0, sizeof(int), &numVerts);
@@ -1347,7 +1438,6 @@ void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( flo
 void btOpenCLSoftBodySolver::solveCollisionsAndUpdateVelocities( float isolverdt )
 {
 	// Copy kernel parameters to GPU
 	m_vertexData.moveToAccelerator();
 	m_clPerClothFriction.moveToGPU();
@@ -1355,7 +1445,6 @@ void btOpenCLSoftBodySolver::solveCollisionsAndUpdateVelocities( float isolverdt
 	m_clPerClothCollisionObjects.moveToGPU();
 	m_clCollisionObjectDetails.moveToGPU();
 	cl_int ciErrNum;
 	int numVerts = m_vertexData.getNumVertices();
 	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 0, sizeof(int), &numVerts);
@@ -1446,7 +1535,7 @@ void btSoftBodySolverOutputCLtoCPU::copySoftBodyToVertexBuffer( const btSoftBody
-cl_kernel CLFunctions::compileCLKernelFromString( const char* kernelSource, const char* kernelName, const char* additionalMacros )
+cl_kernel CLFunctions::compileCLKernelFromString( const char* kernelSource, const char* kernelName, const char* additionalMacros ,const char* orgSrcFileNameForCaching)
 {
 	printf("compiling kernelName: %s ",kernelName);
 	cl_kernel kernel=0;
@@ -1554,20 +1643,25 @@ void btOpenCLSoftBodySolver::predictMotion( float timeStep )
 	m_clPerClothCollisionObjects.changedOnCPU();
 	m_collisionObjectDetails.clear();	
 	if ( m_bUpdateAnchoredNodePos )
 	{
-		BT_PROFILE("perClothWindVelocity");
+		// In OpenCL cloth solver, if softbody node has zero inverse mass(infinite mass) or anchor attached, 
-		// Fill the force arrays with current acceleration data etc
+		// we need to update the node position in case the node or anchor is animated externally.
-		m_perClothWindVelocity.resize( m_softBodySet.size() );
+		// If there is no such node, we can eliminate the unnecessary CPU-to-GPU data trasferring. 
-		for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
+		for ( int i = 0; i < m_anchorNodeInfoArray.size(); i++ )
 		{
-			btSoftBody *softBody = m_softBodySet[softBodyIndex]->getSoftBody();
+			const AnchorNodeInfoCL& anchorNodeInfo = m_anchorNodeInfoArray[i];
 			btSoftBody::Node* node = anchorNodeInfo.pNode;
-			m_perClothWindVelocity[softBodyIndex] = toVector3(softBody->getWindVelocity());
+			using Vectormath::Aos::Point3;
 			Point3 pos((float)node->m_x.getX(), (float)node->m_x.getY(), (float)node->m_x.getZ());				
 			m_anchorPosition[i] = pos;
 		}
-	}
+
-	{
+		if ( m_anchorNodeInfoArray.size() > 0 )
-		BT_PROFILE("changedOnCPU");
+			m_clAnchorPosition.changedOnCPU();
-		m_clPerClothWindVelocity.changedOnCPU();
+
 		updateFixedVertexPositions();
 	}
 	{
@@ -1601,7 +1695,7 @@ static Vectormath::Aos::Transform3 toTransform3( const btTransform &transform )
 void btOpenCLAcceleratedSoftBodyInterface::updateBounds( const btVector3 &lowerBound, const btVector3 &upperBound )
 {
-	float scalarMargin = this->getSoftBody()->getCollisionShape()->getMargin();
+	float scalarMargin = (float)getSoftBody()->getCollisionShape()->getMargin();
 	btVector3 vectorMargin( scalarMargin, scalarMargin, scalarMargin );
 	m_softBody->m_bounds[0] = lowerBound - vectorMargin;
 	m_softBody->m_bounds[1] = upperBound + vectorMargin;
@@ -1641,7 +1735,8 @@ void btOpenCLSoftBodySolver::processCollision( btSoftBody *softBody, btCollision
 			newCollisionShapeDescription.angularVelocity = toVector3(body->getAngularVelocity());
 			m_collisionObjectDetails.push_back( newCollisionShapeDescription );
-		} else {
+		} 		
 		else {
 			printf("Unsupported collision shape type\n");
 			//btAssert(0 && "Unsupported collision shape type\n");
 		}
@@ -1688,32 +1783,36 @@ bool btOpenCLSoftBodySolver::checkInitialized()
 bool btOpenCLSoftBodySolver::buildShaders()
 {
 	// Ensure current kernels are released first
 	releaseKernels();
 	if( m_shadersInitialized )
 		return true;
-	m_clFunctions.clearKernelCompilationFailures();
+	const char* additionalMacros="";
-	m_prepareLinksKernel = m_clFunctions.compileCLKernelFromString( PrepareLinksCLString, "PrepareLinksKernel" );
+	// Ensure current kernels are released first
-	m_updatePositionsFromVelocitiesKernel = m_clFunctions.compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel" );
+	releaseKernels();
-	m_solvePositionsFromLinksKernel = m_clFunctions.compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel" );
+	
-	m_vSolveLinksKernel = m_clFunctions.compileCLKernelFromString( VSolveLinksCLString, "VSolveLinksKernel" );
+	m_currentCLFunctions->clearKernelCompilationFailures();
-	m_updateVelocitiesFromPositionsWithVelocitiesKernel = m_clFunctions.compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel" );
+
-	m_updateVelocitiesFromPositionsWithoutVelocitiesKernel = m_clFunctions.compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel" );
+	m_prepareLinksKernel = m_currentCLFunctions->compileCLKernelFromString( PrepareLinksCLString, "PrepareLinksKernel",additionalMacros,"OpenCLC10/PrepareLinks.cl" );
-	m_solveCollisionsAndUpdateVelocitiesKernel = m_clFunctions.compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel" );
+	m_updatePositionsFromVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel" ,additionalMacros,"OpenCLC10/UpdatePositionsFromVelocities.cl");
-	m_integrateKernel = m_clFunctions.compileCLKernelFromString( IntegrateCLString, "IntegrateKernel" );
+	m_solvePositionsFromLinksKernel = m_currentCLFunctions->compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel",additionalMacros,"OpenCLC10/SolvePositions.cl" );
-	m_applyForcesKernel = m_clFunctions.compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel" );
+	m_vSolveLinksKernel = m_currentCLFunctions->compileCLKernelFromString( VSolveLinksCLString, "VSolveLinksKernel" ,additionalMacros,"OpenCLC10/VSolveLinks.cl");
 	m_updateVelocitiesFromPositionsWithVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel" ,additionalMacros,"OpenCLC10/UpdateNodes.cl");
 	m_updateVelocitiesFromPositionsWithoutVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel" ,additionalMacros,"OpenCLC10/UpdatePositions.cl");
 	m_solveCollisionsAndUpdateVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel" ,additionalMacros,"OpenCLC10/SolveCollisionsAndUpdateVelocities.cl");
 	m_integrateKernel = m_currentCLFunctions->compileCLKernelFromString( IntegrateCLString, "IntegrateKernel" ,additionalMacros,"OpenCLC10/Integrate.cl");
 	m_applyForcesKernel = m_currentCLFunctions->compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel" ,additionalMacros,"OpenCLC10/ApplyForces.cl");
 	m_updateFixedVertexPositionsKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateFixedVertexPositionsCLString, "UpdateFixedVertexPositions" , additionalMacros, "OpenCLC10/UpdateFixedVertexPositions.cl");
 	// TODO: Rename to UpdateSoftBodies
-	m_resetNormalsAndAreasKernel = m_clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel" );
+	m_resetNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel" ,additionalMacros,"OpenCLC10/UpdateNormals.cl");
-	m_normalizeNormalsAndAreasKernel = m_clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel" );
+	m_normalizeNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel" ,additionalMacros,"OpenCLC10/UpdateNormals.cl");
-	m_updateSoftBodiesKernel = m_clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel" );
+	m_updateSoftBodiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel" ,additionalMacros,"OpenCLC10/UpdateNormals.cl");
-	if( m_clFunctions.getKernelCompilationFailures()==0 )
+	if( m_currentCLFunctions->getKernelCompilationFailures()==0 )
 		m_shadersInitialized = true;
 	return m_shadersInitialized;
 }
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
@@ -20,6 +20,7 @@ subject to the following restrictions:
 #include "vectormath/vmInclude.h"
 #include "BulletSoftBody/btSoftBodySolvers.h"
 #include "BulletSoftBody/btSoftBody.h"
 #include "btSoftBodySolverBuffer_OpenCL.h"
 #include "btSoftBodySolverLinkData_OpenCL.h"
 #include "btSoftBodySolverVertexData_OpenCL.h"
@@ -33,6 +34,7 @@ protected:
 	int	m_kernelCompilationFailures;
 public:
 	CLFunctions(cl_command_queue cqCommandQue, cl_context cxMainContext) :
 		m_cqCommandQue( cqCommandQue ),
@@ -49,7 +51,7 @@ public:
 	/**
 	 * Compile a compute shader kernel from a string and return the appropriate cl_kernel object.
 	 */	
-	cl_kernel compileCLKernelFromString( const char* kernelSource, const char* kernelName, const char* additionalMacros = "" );
+	virtual cl_kernel compileCLKernelFromString( const char* kernelSource, const char* kernelName, const char* additionalMacros, const char* srcFileNameForCaching);
 	void	clearKernelCompilationFailures()
 	{
@@ -285,7 +287,8 @@ public:
 protected:
-	CLFunctions m_clFunctions;
+	CLFunctions m_defaultCLFunctions;
 	CLFunctions* m_currentCLFunctions;
 	/** Variable to define whether we need to update solver constants on the next iteration */
 	bool m_updateSolverConstants;
@@ -350,7 +353,20 @@ protected:
 	btAlignedObjectArray< float >	m_perClothFriction;
 	btOpenCLBuffer< float >			m_clPerClothFriction;
 	// anchor node info
 	struct AnchorNodeInfoCL
 	{
 		int clVertexIndex;
 		btSoftBody::Node* pNode;
 	};
 	btAlignedObjectArray<AnchorNodeInfoCL> m_anchorNodeInfoArray;
 	btAlignedObjectArray<Vectormath::Aos::Point3> m_anchorPosition;
 	btOpenCLBuffer<Vectormath::Aos::Point3>		  m_clAnchorPosition;
 	btAlignedObjectArray<int> m_anchorIndex;
 	btOpenCLBuffer<int>		  m_clAnchorIndex;
 	bool m_bUpdateAnchoredNodePos;
 	cl_kernel		m_prepareLinksKernel;
 	cl_kernel		m_solvePositionsFromLinksKernel;
@@ -368,6 +384,7 @@ protected:
 	cl_kernel		m_outputToVertexArrayKernel;
 	cl_kernel		m_applyForcesKernel;
 	cl_kernel       m_updateFixedVertexPositionsKernel;	
 	cl_command_queue	m_cqCommandQue;
 	cl_context			m_cxMainContext;
@@ -394,6 +411,8 @@ protected:
 	virtual void applyForces( float solverdt );
 	void updateFixedVertexPositions();
 	/**
 	 * Integrate motion on the solver.
 	 */
@@ -430,7 +449,7 @@ protected:
 	void releaseKernels();
 public:
-	btOpenCLSoftBodySolver(cl_command_queue queue,cl_context	ctx);
+	btOpenCLSoftBodySolver(cl_command_queue queue,cl_context	ctx, bool bUpdateAchchoredNodePos = false);
 	virtual ~btOpenCLSoftBodySolver();
@@ -456,7 +475,7 @@ public:
 	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);
-	virtual void copyBackToSoftBodies();
+	virtual void copyBackToSoftBodies(bool bMove = true);
 	virtual void solveConstraints( float solverdt );
@@ -475,6 +494,14 @@ public:
 		return m_defaultWorkGroupSize;
 	}
 	void	setCLFunctions(CLFunctions* funcs)
 	{
 		if (funcs)
 			m_currentCLFunctions = funcs;
 		else
 			m_currentCLFunctions  = &m_defaultCLFunctions;
 	}
 }; // btOpenCLSoftBodySolver
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
@@ -33,32 +33,6 @@ static const size_t workGroupSize = GROUP_SIZE;
 //CL_VERSION_1_1 seems broken on NVidia SDK so just disable it
 #if (0)//CL_VERSION_1_1 == 1)
 //OpenCL 1.1 kernels use float3
 #define MSTRINGIFY(A) #A
 static const char* UpdatePositionsFromVelocitiesCLString = 
 #include "OpenCLC/UpdatePositionsFromVelocities.cl"
 static const char* SolvePositionsCLString = 
 #include "OpenCLC/SolvePositionsSIMDBatched.cl"
 static const char* UpdateNodesCLString = 
 #include "OpenCLC/UpdateNodes.cl"
 static const char* UpdatePositionsCLString = 
 #include "OpenCLC/UpdatePositions.cl"
 static const char* UpdateConstantsCLString = 
 #include "OpenCLC/UpdateConstants.cl"
 static const char* IntegrateCLString = 
 #include "OpenCLC/Integrate.cl"
 static const char* ApplyForcesCLString = 
 #include "OpenCLC/ApplyForces.cl"
 static const char* UpdateNormalsCLString = 
 #include "OpenCLC/UpdateNormals.cl"
 static const char* VSolveLinksCLString = 
 #include "OpenCLC/VSolveLinks.cl"
 static const char* SolveCollisionsAndUpdateVelocitiesCLString =
 #include "OpenCLC/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl"
 static const char* OutputToVertexArrayCLString =
 #include "OpenCLC/OutputToVertexArray.cl"
 #else
 ////OpenCL 1.0 kernels don't use float3
 #define MSTRINGIFY(A) #A
 static const char* UpdatePositionsFromVelocitiesCLString = 
@@ -75,6 +49,8 @@ static const char* IntegrateCLString =
 #include "OpenCLC10/Integrate.cl"
 static const char* ApplyForcesCLString = 
 #include "OpenCLC10/ApplyForces.cl"
 static const char* UpdateFixedVertexPositionsCLString = 
 #include "OpenCLC10/UpdateFixedVertexPositions.cl"
 static const char* UpdateNormalsCLString = 
 #include "OpenCLC10/UpdateNormals.cl"
 static const char* VSolveLinksCLString = 
@@ -83,7 +59,6 @@ static const char* SolveCollisionsAndUpdateVelocitiesCLString =
 #include "OpenCLC10/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl"
 static const char* OutputToVertexArrayCLString =
 #include "OpenCLC10/OutputToVertexArray.cl"
 #endif //CL_VERSION_1_1
@@ -194,8 +169,8 @@ bool btSoftBodyLinkDataOpenCLSIMDAware::moveFromAccelerator()
-btOpenCLSoftBodySolverSIMDAware::btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue, cl_context ctx) :
+btOpenCLSoftBodySolverSIMDAware::btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue, cl_context ctx, bool bUpdateAchchoredNodePos) :
-	btOpenCLSoftBodySolver( queue, ctx ),
+	btOpenCLSoftBodySolver( queue, ctx, bUpdateAchchoredNodePos ),
 	m_linkData(queue, ctx)
 {
 	// Initial we will clearly need to update solver constants
@@ -213,14 +188,17 @@ btOpenCLSoftBodySolverSIMDAware::~btOpenCLSoftBodySolverSIMDAware()
 void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody * > &softBodies ,bool forceUpdate)
 {
-	if( forceUpdate|| m_softBodySet.size() != softBodies.size() )
+	if( forceUpdate || m_softBodySet.size() != softBodies.size() )
 	{
 		// Have a change in the soft body set so update, reloading all the data
 		getVertexData().clear();
 		getTriangleData().clear();
 		getLinkData().clear();
 		m_softBodySet.resize(0);
 		m_anchorIndex.clear();
 		int maxPiterations = 0;
 		int maxViterations = 0;
 		for( int softBodyIndex = 0; softBodyIndex < softBodies.size(); ++softBodyIndex )
 		{
@@ -238,8 +216,7 @@ void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody
 			m_perClothLiftFactor.push_back( softBody->m_cfg.kLF );
 			m_perClothDragFactor.push_back( softBody->m_cfg.kDG );
 			m_perClothMediumDensity.push_back(softBody->getWorldInfo()->air_density);
-
+			// Simple init values. Actually we'll put 0 and -1 into them at the appropriate time
 			m_perClothFriction.push_back( softBody->getFriction() );
 			m_perClothCollisionObjects.push_back( CollisionObjectIndices(-1, -1) );
@@ -253,6 +230,7 @@ void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody
 			// Allocate space for new vertices in all the vertex arrays
 			getVertexData().createVertices( numVertices, softBodyIndex, maxVertices );
 			int firstTriangle = getTriangleData().getNumTriangles();
 			int numTriangles = softBody->m_faces.size();
 			int maxTriangles = numTriangles;
@@ -272,6 +250,8 @@ void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody
 				float vertexInverseMass = softBody->m_nodes[vertex].m_im;
 				desc.setInverseMass(vertexInverseMass);
 				getVertexData().setVertexAt( desc, firstVertex + vertex );
 				m_anchorIndex.push_back(-1.0);
 			}
 			// Copy triangles similarly
@@ -318,17 +298,78 @@ void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody
 			newSoftBody->setMaxTriangles( maxTriangles );
 			newSoftBody->setFirstLink( firstLink );
 			newSoftBody->setNumLinks( numLinks );
 			// Find maximum piterations and viterations
 			int piterations = softBody->m_cfg.piterations;
            if ( piterations > maxPiterations )
                  maxPiterations = piterations;
            int viterations = softBody->m_cfg.viterations;
 			if ( viterations > maxViterations )
                  maxViterations = viterations;
 			// zero mass
 			for( int vertex = 0; vertex < numVertices; ++vertex )
 			{
 				if ( softBody->m_nodes[vertex].m_im == 0 )
 				{
 					AnchorNodeInfoCL nodeInfo;
 					nodeInfo.clVertexIndex = firstVertex + vertex;
 					nodeInfo.pNode = &softBody->m_nodes[vertex];
 					m_anchorNodeInfoArray.push_back(nodeInfo);
 				}
 			}			
 			// anchor position
 			if ( numVertices > 0 )
 			{
 				for ( int anchorIndex = 0; anchorIndex < softBody->m_anchors.size(); anchorIndex++ )
 				{
 					btSoftBody::Node* anchorNode = softBody->m_anchors[anchorIndex].m_node;
 					btSoftBody::Node* firstNode = &softBody->m_nodes[0];
 					AnchorNodeInfoCL nodeInfo;
 					nodeInfo.clVertexIndex = firstVertex + (int)(anchorNode - firstNode);
 					nodeInfo.pNode = anchorNode;
 					m_anchorNodeInfoArray.push_back(nodeInfo);
 				}
 			}			
 		}
 		m_anchorPosition.clear();		
 		m_anchorPosition.resize(m_anchorNodeInfoArray.size());
 		for ( int anchorNode = 0; anchorNode < m_anchorNodeInfoArray.size(); anchorNode++ )
 		{
 			const AnchorNodeInfoCL& anchorNodeInfo = m_anchorNodeInfoArray[anchorNode];
 			m_anchorIndex[anchorNodeInfo.clVertexIndex] = anchorNode;
 			getVertexData().getInverseMass(anchorNodeInfo.clVertexIndex) = 0.0f;
 		}
 		updateConstants(0.f);
 		// set position and velocity iterations
 		setNumberOfPositionIterations(maxPiterations);
 		setNumberOfVelocityIterations(maxViterations);
 		// set wind velocity
 		m_perClothWindVelocity.resize( m_softBodySet.size() );
 		for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
 		{
 			btSoftBody *softBody = m_softBodySet[softBodyIndex]->getSoftBody();			
 			m_perClothWindVelocity[softBodyIndex] = toVector3(softBody->getWindVelocity());
 		}
 		m_clPerClothWindVelocity.changedOnCPU();
 		// generate batches
 		m_linkData.generateBatches();		
 		m_triangleData.generateBatches();
 		// Build the shaders to match the batching parameters
 		buildShaders();
 	}
@@ -509,7 +550,9 @@ bool btOpenCLSoftBodySolverSIMDAware::buildShaders()
 	if( m_shadersInitialized )
 		return true;
-	m_clFunctions.clearKernelCompilationFailures();
+	const char* additionalMacros="";
 	m_currentCLFunctions->clearKernelCompilationFailures();
 	char *wavefrontMacros = new char[256];
@@ -522,22 +565,23 @@ bool btOpenCLSoftBodySolverSIMDAware::buildShaders()
 		WAVEFRONT_BLOCK_MULTIPLIER,
 		WAVEFRONT_BLOCK_MULTIPLIER*m_linkData.getWavefrontSize());
-	m_updatePositionsFromVelocitiesKernel = m_clFunctions.compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel", "" );
+	m_updatePositionsFromVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel", additionalMacros,"OpenCLC10/UpdatePositionsFromVelocities.cl");
-	m_solvePositionsFromLinksKernel = m_clFunctions.compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel", wavefrontMacros );
+	m_solvePositionsFromLinksKernel = m_currentCLFunctions->compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel", wavefrontMacros ,"OpenCLC10/SolvePositionsSIMDBatched.cl");
-	m_updateVelocitiesFromPositionsWithVelocitiesKernel = m_clFunctions.compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel", "" );
+	m_updateVelocitiesFromPositionsWithVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel", additionalMacros ,"OpenCLC10/UpdateNodes.cl");
-	m_updateVelocitiesFromPositionsWithoutVelocitiesKernel = m_clFunctions.compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel", "" );
+	m_updateVelocitiesFromPositionsWithoutVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel", additionalMacros,"OpenCLC10/UpdatePositions.cl");
-	m_integrateKernel = m_clFunctions.compileCLKernelFromString( IntegrateCLString, "IntegrateKernel", "" );
+	m_integrateKernel = m_currentCLFunctions->compileCLKernelFromString( IntegrateCLString, "IntegrateKernel", additionalMacros ,"OpenCLC10/Integrate.cl");
-	m_applyForcesKernel = m_clFunctions.compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel", "" );
+	m_applyForcesKernel = m_currentCLFunctions->compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel", additionalMacros,"OpenCLC10/ApplyForces.cl" );
-	m_solveCollisionsAndUpdateVelocitiesKernel = m_clFunctions.compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel", "" );
+	m_updateFixedVertexPositionsKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateFixedVertexPositionsCLString, "UpdateFixedVertexPositions" ,additionalMacros,"OpenCLC10/UpdateFixedVertexPositions.cl");
 	m_solveCollisionsAndUpdateVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel", additionalMacros ,"OpenCLC10/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl");
 	// TODO: Rename to UpdateSoftBodies
-	m_resetNormalsAndAreasKernel = m_clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel", "" );
+	m_resetNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel", additionalMacros ,"OpenCLC10/UpdateNormals.cl");
-	m_normalizeNormalsAndAreasKernel = m_clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel", "" );
+	m_normalizeNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel", additionalMacros ,"OpenCLC10/UpdateNormals.cl");
-	m_updateSoftBodiesKernel = m_clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel", "" );
+	m_updateSoftBodiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel", additionalMacros ,"OpenCLC10/UpdateNormals.cl");
 	delete [] wavefrontMacros;
-	if( m_clFunctions.getKernelCompilationFailures()==0)
+	if( m_currentCLFunctions->getKernelCompilationFailures()==0)
 	{
 		m_shadersInitialized = true;
 	}
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h
@@ -39,7 +39,7 @@ protected:
-	bool buildShaders();
+	virtual bool buildShaders();
 	void updateConstants( float timeStep );
@@ -59,7 +59,7 @@ protected:
 	/////////////////////////////////////
 public:
-	btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue,cl_context	ctx);
+	btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue,cl_context	ctx, bool bUpdateAchchoredNodePos = false);
 	virtual ~btOpenCLSoftBodySolverSIMDAware();
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h
@@ -561,10 +561,14 @@ public:
 	}
 	/**
-	 * Move data from host memory from the accelerator.
+	 * Move data to host memory from the accelerator if bCopy is false.
 	 * If bCopy is true, copy data to host memory from the accelerator so that data 
 	 * won't be moved to accelerator when moveToAccelerator() is called next time. 
 	 * If bCopyMinimum is true, only vertex position and normal are copied.
 	 * bCopyMinimum will be meaningful only if bCopy is true.
 	 * The CPU version will always return that it has moved it.
 	 */
-	virtual bool moveFromAccelerator()
+	virtual bool moveFromAccelerator(bool bCopy = false, bool bCopyMinimum = true)
 	{
 		return true;
 	}