Apply a patch that updates vertex position on GPU for the OpenCL version, by Dongsoo Han (saggita), work-in-progress

Removed the unused OpenCL kernels Add example how to cache binary kernels, see SoftDemo compiled with OpenCL AMD using msvc/vs2008_opencl.bat
2011-12-20 18:03:24 +00:00
parent 5cb40ed6e3
commit 1f7a67b7da
31 changed files with 441 additions and 1386 deletions
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexData_DX11.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverVertexData_DX11.h
@@ -55,7 +55,7 @@ public:
 	virtual bool onAccelerator();
 	virtual bool moveToAccelerator();

-	virtual bool moveFromAccelerator();
+	virtual bool moveFromAccelerator(bool bCopy = false, bool bCopyMinimum = true);
 };


--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
@@ -308,18 +308,42 @@ bool btSoftBodyVertexDataDX11::moveToAccelerator()
 	return success;
 }

-bool btSoftBodyVertexDataDX11::moveFromAccelerator()
+bool btSoftBodyVertexDataDX11::moveFromAccelerator(bool bCopy, bool bCopyMinimum)
 {
 	bool success = true;
-	success = success && m_dx11ClothIdentifier.moveFromGPU();
-	success = success && m_dx11VertexPosition.moveFromGPU();
-	success = success && m_dx11VertexPreviousPosition.moveFromGPU();
-	success = success && m_dx11VertexVelocity.moveFromGPU();
-	success = success && m_dx11VertexForceAccumulator.moveFromGPU();
-	success = success && m_dx11VertexNormal.moveFromGPU();
-	success = success && m_dx11VertexInverseMass.moveFromGPU();
-	success = success && m_dx11VertexArea.moveFromGPU();
-	success = success && m_dx11VertexTriangleCount.moveFromGPU();
+
+	if (!bCopy)
+	{
+		success = success && m_dx11ClothIdentifier.moveFromGPU();
+		success = success && m_dx11VertexPosition.moveFromGPU();
+		success = success && m_dx11VertexPreviousPosition.moveFromGPU();
+		success = success && m_dx11VertexVelocity.moveFromGPU();
+		success = success && m_dx11VertexForceAccumulator.moveFromGPU();
+		success = success && m_dx11VertexNormal.moveFromGPU();
+		success = success && m_dx11VertexInverseMass.moveFromGPU();
+		success = success && m_dx11VertexArea.moveFromGPU();
+		success = success && m_dx11VertexTriangleCount.moveFromGPU();
+	}
+	else
+	{
+		if (bCopyMinimum)
+		{
+			success = success && m_dx11VertexPosition.copyFromGPU();
+			success = success && m_dx11VertexNormal.copyFromGPU();
+		}
+		else
+		{
+			success = success && m_dx11ClothIdentifier.copyFromGPU();
+			success = success && m_dx11VertexPosition.copyFromGPU();
+			success = success && m_dx11VertexPreviousPosition.copyFromGPU();
+			success = success && m_dx11VertexVelocity.copyFromGPU();
+			success = success && m_dx11VertexForceAccumulator.copyFromGPU();
+			success = success && m_dx11VertexNormal.copyFromGPU();
+			success = success && m_dx11VertexInverseMass.copyFromGPU();
+			success = success && m_dx11VertexArea.copyFromGPU();
+			success = success && m_dx11VertexTriangleCount.copyFromGPU();
+		}
+	}

 	if( success )
 		m_onGPU = true;
@@ -619,10 +643,10 @@ void btDX11SoftBodySolver::releaseKernels()
 }


-void btDX11SoftBodySolver::copyBackToSoftBodies()
+void btDX11SoftBodySolver::copyBackToSoftBodies(bool bMove)
 {
 	// Move the vertex data back to the host first
-	m_vertexData.moveFromAccelerator();
+	m_vertexData.moveFromAccelerator(!bMove);

 	// Loop over soft bodies, copying all the vertex positions back for each body in turn
 	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
@@ -607,7 +607,7 @@ public:

 	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);

-	virtual void copyBackToSoftBodies();
+	virtual void copyBackToSoftBodies(bool bMove = true);

 	virtual void solveConstraints( float solverdt );

--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
@@ -34,7 +34,6 @@ SET(BulletSoftBodyOpenCLSolvers_HDRS
 ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_AMD
 	${BulletSoftBodyOpenCLSolvers_SRCS} 
 	${BulletSoftBodyOpenCLSolvers_HDRS} 
-	${BulletSoftBodyOpenCLSolvers_OpenCLC}
 )

 SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES VERSION ${BULLET_VERSION})
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Intel/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Intel/CMakeLists.txt
@@ -45,7 +45,7 @@ SET(BulletSoftBodyOpenCLSolvers_Shaders
 )

 foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
-    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC/${f}.cl")
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${f}.cl")
 endforeach(f) 


--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
@@ -25,6 +25,7 @@ subject to the following restrictions:
 #include "../OpenCLC10/UpdatePositions.cl"
 #include "../OpenCLC10/UpdatePositionsFromVelocities.cl"
 #include "../OpenCLC10/VSolveLinks.cl"
+#include "../OpenCLC10/UpdateFixedVertexPositions.cl"
 //#include "../OpenCLC10/SolveCollisionsAndUpdateVelocities.cl"


@@ -39,7 +40,7 @@ MINICL_REGISTER(ApplyForcesKernel)
 MINICL_REGISTER(ResetNormalsAndAreasKernel)
 MINICL_REGISTER(NormalizeNormalsAndAreasKernel)
 MINICL_REGISTER(UpdateSoftBodiesKernel)
-
+MINICL_REGISTER(UpdateFixedVertexPositions)

 float mydot3a(float4 a, float4 b)
 {
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
@@ -44,7 +44,7 @@ SET(BulletSoftBodyOpenCLSolvers_Shaders
 )

 foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
-    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC/${f}.cl")
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${f}.cl")
 endforeach(f) 


--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ApplyForces.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ApplyForces.cl
@@ -1,91 +0,0 @@
-MSTRINGIFY(
-
-/*#define float3 float4
-float dot3(float3 a, float3 b)
-{
-   return a.x*b.x + a.y*b.y + a.z*b.z;
-}*/
-
-float3 projectOnAxis( float3 v, float3 a )
-{
-	return (a*dot(v, a));
-}
-
-__kernel void 
-ApplyForcesKernel(
-	const uint numNodes,
-	const float solverdt,
-	const float epsilon,
-	__global int * g_vertexClothIdentifier,
-	__global float4 * g_vertexNormal,
-	__global float * g_vertexArea,
-	__global float * g_vertexInverseMass,
-	__global float * g_clothLiftFactor,
-	__global float * g_clothDragFactor,
-	__global float4 * g_clothWindVelocity,
-	__global float4 * g_clothAcceleration,
-	__global float * g_clothMediumDensity,
-	__global float4 * g_vertexForceAccumulator,
-	__global float4 * g_vertexVelocity)
-{
-	unsigned int nodeID = get_global_id(0);
-	if( nodeID < numNodes )
-	{		
-		int clothId  = g_vertexClothIdentifier[nodeID];
-		float nodeIM = g_vertexInverseMass[nodeID];
-		
-		if( nodeIM > 0.0f )
-		{
-			float3 nodeV  = g_vertexVelocity[nodeID].xyz;
-			float3 normal = g_vertexNormal[nodeID].xyz;
-			float area    = g_vertexArea[nodeID];
-			float3 nodeF  = g_vertexForceAccumulator[nodeID].xyz;
-			
-			// Read per-cloth values
-			float3 clothAcceleration = g_clothAcceleration[clothId].xyz;
-			float3 clothWindVelocity = g_clothWindVelocity[clothId].xyz;
-			float liftFactor = g_clothLiftFactor[clothId];
-			float dragFactor = g_clothDragFactor[clothId];
-			float mediumDensity = g_clothMediumDensity[clothId];
-		
-			// Apply the acceleration to the cloth rather than do this via a force
-			nodeV += (clothAcceleration*solverdt);
-
-			g_vertexVelocity[nodeID] = (float4)(nodeV, 0.f);
-
-			float3 relativeWindVelocity = nodeV - clothWindVelocity;
-			float relativeSpeedSquared = dot(relativeWindVelocity, relativeWindVelocity);
-			
-			if( relativeSpeedSquared > epsilon )
-			{
-				// Correct direction of normal relative to wind direction and get dot product
-				normal = normal * (dot(normal, relativeWindVelocity) < 0 ? -1.f : 1.f);
-				float dvNormal = dot(normal, relativeWindVelocity);
-				if( dvNormal > 0 )
-				{
-					float3 force = (float3)(0.f, 0.f, 0.f);
-					float c0 = area * dvNormal * relativeSpeedSquared / 2.f;
-					float c1 = c0 * mediumDensity;
-					force += normal * (-c1 * liftFactor);
-					force += normalize(relativeWindVelocity)*(-c1 * dragFactor);
-					
-					float dtim = solverdt * nodeIM;
-					float3 forceDTIM = force * dtim;
-					
-					float3 nodeFPlusForce = nodeF + force;
-					
-					// m_nodesf[i] -= ProjectOnAxis(m_nodesv[i], force.normalized())/dtim;	
-					float3 nodeFMinus = nodeF - (projectOnAxis(nodeV, normalize(force))/dtim);
-					
-					nodeF = nodeFPlusForce;
-					if( dot(forceDTIM, forceDTIM) > dot(nodeV, nodeV) )
-						nodeF = nodeFMinus;
-									
-					g_vertexForceAccumulator[nodeID] = (float4)(nodeF, 0.0f);	
-				}
-			}
-		}
-	}
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ComputeBounds.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ComputeBounds.cl
@@ -1,80 +0,0 @@
-MSTRINGIFY(
-#pragma OPENCL EXTENSION cl_amd_printf : enable \n
-
-
-
-__kernel void
-ComputeBoundsKernel( 
-	int numNodes,
-	int numSoftBodies,
-	__global int * g_vertexClothIdentifier,
-	__global float4 * g_vertexPositions,
-	volatile __global uint * g_clothMinBounds,
-	volatile __global uint * g_clothMaxBounds,
-	volatile __local uint * clothMinBounds,
-	volatile __local uint * clothMaxBounds)
-{
-	// Init min and max bounds arrays
-	if( get_local_id(0) < numSoftBodies )
-	{
-		
-		clothMinBounds[get_local_id(0)*4] = UINT_MAX;
-		clothMinBounds[get_local_id(0)*4+1] = UINT_MAX;
-		clothMinBounds[get_local_id(0)*4+2] = UINT_MAX;
-		clothMinBounds[get_local_id(0)*4+3] = UINT_MAX;
-		clothMaxBounds[get_local_id(0)*4] = 0;
-		clothMaxBounds[get_local_id(0)*4+1] = 0;
-		clothMaxBounds[get_local_id(0)*4+2] = 0;
-		clothMaxBounds[get_local_id(0)*4+3] = 0;
-
-	}
-
-
-	barrier(CLK_GLOBAL_MEM_FENCE);
-
-	int nodeID = get_global_id(0);
-	if( nodeID < numNodes )
-	{	
-		int clothIdentifier = g_vertexClothIdentifier[get_global_id(0)];
-		if( clothIdentifier >= 0 )
-		{
-			float3 position = g_vertexPositions[get_global_id(0)].xyz;
-
-			/* Reinterpret position as uint */
-			uint3 positionUInt = (uint3)(as_uint(position.x), as_uint(position.y), as_uint(position.z));
-		
-			/* Invert sign bit of positives and whole of negatives to allow comparison as unsigned ints */
-			positionUInt.x ^= (1+~(positionUInt.x >> 31) | 0x80000000);
-			positionUInt.y ^= (1+~(positionUInt.y >> 31) | 0x80000000);		
-			positionUInt.z ^= (1+~(positionUInt.z >> 31) | 0x80000000);
-		
-			/* Min/max with the LDS values */
-			atomic_min(&(clothMinBounds[clothIdentifier*4]), positionUInt.x);
-			atomic_min(&(clothMinBounds[clothIdentifier*4+1]), positionUInt.y);
-			atomic_min(&(clothMinBounds[clothIdentifier*4+2]), positionUInt.z);
-
-			atomic_max(&(clothMaxBounds[clothIdentifier*4]), positionUInt.x);
-			atomic_max(&(clothMaxBounds[clothIdentifier*4+1]), positionUInt.y);
-			atomic_max(&(clothMaxBounds[clothIdentifier*4+2]), positionUInt.z);
-		}
-	}
-	
-	barrier(CLK_GLOBAL_MEM_FENCE);
-
-
-	/* Use global atomics to update the global versions of the data*/
-	if( get_local_id(0) < numSoftBodies )
-	{
-		/*atomic_min(&(g_clothMinBounds[get_local_id(0)].x), clothMinBounds[get_local_id(0)].x);*/
-		atomic_min(&(g_clothMinBounds[get_local_id(0)*4]), clothMinBounds[get_local_id(0)*4]);
-		atomic_min(&(g_clothMinBounds[get_local_id(0)*4+1]), clothMinBounds[get_local_id(0)*4+1]);
-		atomic_min(&(g_clothMinBounds[get_local_id(0)*4+2]), clothMinBounds[get_local_id(0)*4+2]);
-
-		atomic_max(&(g_clothMaxBounds[get_local_id(0)*4]), clothMaxBounds[get_local_id(0)*4]);		
-		atomic_max(&(g_clothMaxBounds[get_local_id(0)*4+1]), clothMaxBounds[get_local_id(0)*4+1]);
-		atomic_max(&(g_clothMaxBounds[get_local_id(0)*4+2]), clothMaxBounds[get_local_id(0)*4+2]);
-	}
-}
-
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/Integrate.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/Integrate.cl
@@ -1,35 +0,0 @@
-MSTRINGIFY(
-
-// Node indices for each link
-
-//#define float3 float4
-
-__kernel void
-IntegrateKernel( 
-	const int numNodes,
-	const float solverdt,
-	__global float * g_vertexInverseMasses,
-	__global float4 * g_vertexPositions,
-	__global float4 * g_vertexVelocity,
-	__global float4 * g_vertexPreviousPositions,
-	__global float4 * g_vertexForceAccumulator)
-{
-	int nodeID = get_global_id(0);
-	if( nodeID < numNodes )
-	{	
-		float3 position   = g_vertexPositions[nodeID].xyz;
-		float3 velocity   = g_vertexVelocity[nodeID].xyz;
-		float3 force      = g_vertexForceAccumulator[nodeID].xyz;
-		float inverseMass = g_vertexInverseMasses[nodeID];
-		
-		g_vertexPreviousPositions[nodeID] = (float4)(position, 0.f);
-		velocity += force * inverseMass * solverdt;
-		position += velocity * solverdt;
-		
-		g_vertexForceAccumulator[nodeID] = (float4)(0.f, 0.f, 0.f, 0.0f);
-		g_vertexPositions[nodeID]        = (float4)(position, 0.f);
-		g_vertexVelocity[nodeID]         = (float4)(velocity, 0.f);	
-	}
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/OutputToVertexArray.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/OutputToVertexArray.cl
@@ -1,46 +0,0 @@
-MSTRINGIFY(
-
-cbuffer OutputToVertexArrayCB : register( b0 )
-{
-	int startNode;
-	int numNodes;
-	int offsetX;
-	int strideX;
-	
-	int offsetN;	
-	int strideN;
-	int padding1;
-	int padding2;
-};
-
-
-StructuredBuffer<float4> g_nodesx : register( t0 );
-StructuredBuffer<float4> g_nodesn : register( t1 );
-
-RWStructuredBuffer<float> g_vertexBuffer : register( u0 );
-
-
-[numthreads(128, 1, 1)]
-void 
-OutputToVertexArrayKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
-{
-	int nodeID = DTid.x;
-	if( nodeID < numNodes )
-	{			
-		float4 nodeX = g_nodesx[nodeID + startNode];
-		float4 nodeN = g_nodesn[nodeID + startNode];
-		
-		// Stride should account for the float->float4 conversion
-		int positionDestination = nodeID * strideX + offsetX;		
-		g_vertexBuffer[positionDestination] = nodeX.x;
-		g_vertexBuffer[positionDestination+1] = nodeX.y;
-		g_vertexBuffer[positionDestination+2] = nodeX.z;
-		
-		int normalDestination = nodeID * strideN + offsetN;
-		g_vertexBuffer[normalDestination] = nodeN.x;
-		g_vertexBuffer[normalDestination+1] = nodeN.y;
-		g_vertexBuffer[normalDestination+2] = nodeN.z;		
-	}
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/PrepareLinks.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/PrepareLinks.cl
@@ -1,34 +0,0 @@
-MSTRINGIFY(
-
-__kernel void 
-PrepareLinksKernel( 
-	const int numLinks,
-	__global int2 * g_linksVertexIndices,
-	__global float * g_linksMassLSC,
-	__global float4 * g_nodesPreviousPosition,
-	__global float * g_linksLengthRatio,
-	__global float4 * g_linksCurrentLength)
-{
-	int linkID = get_global_id(0);
-	if( linkID < numLinks )
-	{	
-		int2 nodeIndices = g_linksVertexIndices[linkID];
-		int node0 = nodeIndices.x;
-		int node1 = nodeIndices.y;
-		
-		float4 nodePreviousPosition0 = g_nodesPreviousPosition[node0];
-		float4 nodePreviousPosition1 = g_nodesPreviousPosition[node1];
-
-		float massLSC = g_linksMassLSC[linkID];
-		
-		float4 linkCurrentLength = nodePreviousPosition1 - nodePreviousPosition0;
-		
-		float linkLengthRatio = dot(linkCurrentLength, linkCurrentLength)*massLSC;
-		linkLengthRatio = 1.0f/linkLengthRatio;
-		
-		g_linksCurrentLength[linkID] = linkCurrentLength;
-		g_linksLengthRatio[linkID]   = linkLengthRatio;		
-	}
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolveCollisionsAndUpdateVelocities.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolveCollisionsAndUpdateVelocities.cl
@@ -1,195 +0,0 @@
-MSTRINGIFY(
-
-typedef struct 
-{
-	int firstObject;
-	int endObject;
-} CollisionObjectIndices;
-
-typedef struct 
-{
-	float4 shapeTransform[4]; // column major 4x4 matrix
-	float4 linearVelocity;
-	float4 angularVelocity;
-
-	int softBodyIdentifier;
-	int collisionShapeType;
-	
-
-	// Shape information
-	// Compressed from the union
-	float radius;
-	float halfHeight;
-	int upAxis;
-		
-	float margin;
-	float friction;
-
-	int padding0;
-	
-} CollisionShapeDescription;
-
-/* From btBroadphaseProxy.h */
-__constant int CAPSULE_SHAPE_PROXYTYPE = 10;
-
-/* Multiply column-major matrix against vector */
-float4 matrixVectorMul( float4 matrix[4], float4 vector )
-{
-	float4 returnVector;
-	float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
-	float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
-	float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
-	float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
-	returnVector.x = dot(row0, vector);
-	returnVector.y = dot(row1, vector);
-	returnVector.z = dot(row2, vector);
-	returnVector.w = dot(row3, vector);
-	return returnVector;
-}
-
-__kernel void 
-SolveCollisionsAndUpdateVelocitiesKernel( 
-	const int numNodes,
-	const float isolverdt,
-	__global int *g_vertexClothIdentifier,
-	__global float4 *g_vertexPreviousPositions,
-	__global float * g_perClothFriction,
-	__global float * g_clothDampingFactor,
-	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
-	__global CollisionShapeDescription * g_collisionObjectDetails,
-	__global float4 * g_vertexForces,
-	__global float4 *g_vertexVelocities,
-	__global float4 *g_vertexPositions)
-{
-	int nodeID = get_global_id(0);
-	float3 forceOnVertex = (float3)(0.f, 0.f, 0.f);
-	if( get_global_id(0) < numNodes )
-	{	
-		int clothIdentifier = g_vertexClothIdentifier[nodeID];
-		
-		// Abort if this is not a valid cloth
-		if( clothIdentifier < 0 )
-			return;
-
-		float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f);
-		float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f);
-		float3 velocity;
-		float clothFriction = g_perClothFriction[clothIdentifier];
-		float dampingFactor = g_clothDampingFactor[clothIdentifier];
-		float velocityCoefficient = (1.f - dampingFactor);		
-		CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
-	
-		if( collisionObjectIndices.firstObject != collisionObjectIndices.endObject )
-		{
-			velocity = (float3)(15, 0, 0);
-
-			/* We have some possible collisions to deal with */
-			for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision )
-			{
-				CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision];
-				float colliderFriction = shapeDescription.friction;
-		
-				if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
-				{
-					/* Colliding with a capsule */
-
-					float capsuleHalfHeight = shapeDescription.halfHeight;
-					float capsuleRadius = shapeDescription.radius;
-					float capsuleMargin = shapeDescription.margin;
-					int capsuleupAxis = shapeDescription.upAxis;
-
-					/* Four columns of worldTransform matrix */
-					float4 worldTransform[4];
-					worldTransform[0] = shapeDescription.shapeTransform[0];
-					worldTransform[1] = shapeDescription.shapeTransform[1];
-					worldTransform[2] = shapeDescription.shapeTransform[2];
-					worldTransform[3] = shapeDescription.shapeTransform[3];
-
-					// Correctly define capsule centerline vector 
-					float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); 
-					float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
-					c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
-					c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
-					c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
-					c2.x = -c1.x;
-					c2.y = -c1.y;
-					c2.z = -c1.z;
-
-					float4 worldC1 = matrixVectorMul(worldTransform, c1);
-					float4 worldC2 = matrixVectorMul(worldTransform, c2);
-					float3 segment = (worldC2 - worldC1).xyz;
-
-					/* compute distance of tangent to vertex along line segment in capsule */
-					float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
-
-					float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment, 0.f));
-					float distanceFromLine = length(position - closestPoint);
-					float distanceFromC1 = length(worldC1 - position);
-					float distanceFromC2 = length(worldC2 - position);
-					
-					/* Final distance from collision, point to push from, direction to push in
-					   for impulse force */
-					float dist;
-					float3 normalVector;
-					if( distanceAlongSegment < 0 )
-					{
-						dist = distanceFromC1;
-						normalVector = normalize(position - worldC1).xyz;
-					} else if( distanceAlongSegment > 1.f ) {
-						dist = distanceFromC2;
-						normalVector = normalize(position - worldC2).xyz;	
-					} else {
-						dist = distanceFromLine;
-						normalVector = normalize(position - closestPoint).xyz;
-					}
-						
-					float3 colliderLinearVelocity = shapeDescription.linearVelocity.xyz;
-					float3 colliderAngularVelocity = shapeDescription.angularVelocity.xyz;
-					float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - (float3)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w));
-
-					float minDistance = capsuleRadius + capsuleMargin;
-					
-					/* In case of no collision, this is the value of velocity */
-					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
-					
-					
-					// Check for a collision
-					if( dist < minDistance )
-					{
-						/* Project back to surface along normal */
-						position = position + (float4)((minDistance - dist)*normalVector*0.9f, 0.f);
-						velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
-						float3 relativeVelocity = velocity - velocityOfSurfacePoint;
-
-						float3 p1 = normalize(cross(normalVector, segment));
-						float3 p2 = normalize(cross(p1, normalVector));
-						/* Full friction is sum of velocities in each direction of plane */
-						float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
-
-						/* Real friction is peak friction corrected by friction coefficients */
-						frictionVector = frictionVector * (colliderFriction*clothFriction);
-
-						float approachSpeed = dot(relativeVelocity, normalVector);
-
-						if( approachSpeed <= 0.0f )
-							forceOnVertex -= frictionVector;
-					}
-					
-				}
-			}
-		} else {
-			/* Update velocity	 */
-			float3 difference = position.xyz - previousPosition.xyz;
-			velocity = difference*velocityCoefficient*isolverdt;			
-		}
-
-		g_vertexVelocities[nodeID] = (float4)(velocity, 0.f);	
-
-		/* Update external force */
-		g_vertexForces[nodeID] = (float4)(forceOnVertex, 0.f);
-
-		g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
-	}
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl
@@ -1,213 +0,0 @@
-MSTRINGIFY(
-
-typedef struct 
-{
-	int firstObject;
-	int endObject;
-} CollisionObjectIndices;
-
-typedef struct 
-{
-	float4 shapeTransform[4]; /* column major 4x4 matrix */
-	float4 linearVelocity;
-	float4 angularVelocity;
-
-	int softBodyIdentifier;
-	int collisionShapeType;
-	
-
-	// Shape information
-	// Compressed from the union
-	float radius;
-	float halfHeight;
-	int upAxis;
-		
-	float margin;
-	float friction;
-
-	int padding0;
-	
-} CollisionShapeDescription;
-
-/* From btBroadphaseProxy.h */
-__constant int CAPSULE_SHAPE_PROXYTYPE = 10;
-
-
-/* Multiply column-major matrix against vector */
-float4 matrixVectorMul( float4 matrix[4], float4 vector )
-{
-	float4 returnVector;
-	float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x);
-	float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y);
-	float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z);
-	float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w);
-	returnVector.x = dot(row0, vector);
-	returnVector.y = dot(row1, vector);
-	returnVector.z = dot(row2, vector);
-	returnVector.w = dot(row3, vector);
-	return returnVector;
-}
-
-__kernel void 
-SolveCollisionsAndUpdateVelocitiesKernel( 
-	const int numNodes,
-	const float isolverdt,
-	__global int *g_vertexClothIdentifier,
-	__global float4 *g_vertexPreviousPositions,
-	__global float * g_perClothFriction,
-	__global float * g_clothDampingFactor,
-	__global CollisionObjectIndices * g_perClothCollisionObjectIndices,
-	__global CollisionShapeDescription * g_collisionObjectDetails,
-	__global float4 * g_vertexForces,
-	__global float4 *g_vertexVelocities,
-	__global float4 *g_vertexPositions,
-	__local CollisionShapeDescription *localCollisionShapes)
-{
-	int nodeID = get_global_id(0);
-	float3 forceOnVertex = (float3)(0.f, 0.f, 0.f);
-
-	int clothIdentifier = g_vertexClothIdentifier[nodeID];
-	
-	// Abort if this is not a valid cloth
-	if( clothIdentifier < 0 )
-		return;
-
-	float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f);
-	float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f);
-	float3 velocity;
-	float clothFriction = g_perClothFriction[clothIdentifier];
-	float dampingFactor = g_clothDampingFactor[clothIdentifier];
-	float velocityCoefficient = (1.f - dampingFactor);		
-	CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier];
-	
-	int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
-	if( numObjects > 0 )
-	{
-		/* We have some possible collisions to deal with */
-		
-		/* First load all of the collision objects into LDS */
-		int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject;
-		if( get_local_id(0) < numObjects )
-		{
-			localCollisionShapes[get_local_id(0)] = g_collisionObjectDetails[ collisionObjectIndices.firstObject + get_local_id(0) ];
-		}
-	}
-
-	/* Safe as the vertices are padded so that not more than one soft body is in a group */
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	/* Annoyingly, even though I know the flow control is not varying, the compiler will not let me skip this */
-	if( numObjects > 0 )
-	{
-		velocity = (float3)(0, 0, 0);
-		
-		
-		// We have some possible collisions to deal with
-		for( int collision = 0; collision < numObjects; ++collision )
-		{
-			CollisionShapeDescription shapeDescription = localCollisionShapes[collision];
-			float colliderFriction = shapeDescription.friction;
-		
-			if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE )
-			{
-				/* Colliding with a capsule */
-
-				float capsuleHalfHeight = localCollisionShapes[collision].halfHeight;
-				float capsuleRadius = localCollisionShapes[collision].radius;
-				float capsuleMargin = localCollisionShapes[collision].margin;
-				int capsuleupAxis = localCollisionShapes[collision].upAxis;
-
-				float4 worldTransform[4];
-				worldTransform[0] = localCollisionShapes[collision].shapeTransform[0];
-				worldTransform[1] = localCollisionShapes[collision].shapeTransform[1];
-				worldTransform[2] = localCollisionShapes[collision].shapeTransform[2];
-				worldTransform[3] = localCollisionShapes[collision].shapeTransform[3];
-
-				// Correctly define capsule centerline vector 
-				float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); 
-				float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f);
-				c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 );
-				c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 );
-				c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 );
-				c2.x = -c1.x;
-				c2.y = -c1.y;
-				c2.z = -c1.z;
-
-				float4 worldC1 = matrixVectorMul(worldTransform, c1);
-				float4 worldC2 = matrixVectorMul(worldTransform, c2);
-				float3 segment = (worldC2 - worldC1).xyz;
-
-
-				/* compute distance of tangent to vertex along line segment in capsule */
-				float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) );
-
-				float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment, 0.f));
-				float distanceFromLine = length(position - closestPoint);
-				float distanceFromC1 = length(worldC1 - position);
-				float distanceFromC2 = length(worldC2 - position);
-					
-				/* Final distance from collision, point to push from, direction to push in
-				   for impulse force */
-				float dist;
-				float3 normalVector;
-				if( distanceAlongSegment < 0 )
-				{
-					dist = distanceFromC1;
-					normalVector = normalize(position - worldC1).xyz;
-				} else if( distanceAlongSegment > 1.f ) {
-					dist = distanceFromC2;
-					normalVector = normalize(position - worldC2).xyz;	
-				} else {
-					dist = distanceFromLine;
-					normalVector = normalize(position - closestPoint).xyz;
-				}
-						
-				float3 colliderLinearVelocity = localCollisionShapes[collision].linearVelocity.xyz;
-				float3 colliderAngularVelocity = localCollisionShapes[collision].angularVelocity.xyz;
-				float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz -  (float3)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w));
-
-				float minDistance = capsuleRadius + capsuleMargin;
-					
-				/* In case of no collision, this is the value of velocity */
-				velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
-					
-					
-				/* Check for a collision */
-				if( dist < minDistance )
-				{
-					/* Project back to surface along normal */
-					position = position + (float4)((minDistance - dist)*normalVector*0.9f, 0.f);
-					velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt;
-					float3 relativeVelocity = velocity - velocityOfSurfacePoint;
-
-					float3 p1 = normalize(cross(normalVector, segment));
-					float3 p2 = normalize(cross(p1, normalVector));
-					/* Full friction is sum of velocities in each direction of plane */
-					float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2);
-
-					/* Real friction is peak friction corrected by friction coefficients */
-					frictionVector = frictionVector * (colliderFriction*clothFriction);
-
-					float approachSpeed = dot(relativeVelocity, normalVector);
-
-					if( approachSpeed <= 0.0f )
-						forceOnVertex -= frictionVector;
-				}
-					
-			}
-		}
-	} else {
-		/* Update velocity	*/
-		float3 difference = position.xyz - previousPosition.xyz;
-		velocity = difference*velocityCoefficient*isolverdt;			
-	}
-
-	g_vertexVelocities[nodeID] = (float4)(velocity, 0.f);	
-
-	/* Update external force */
-	g_vertexForces[nodeID] = (float4)(forceOnVertex, 0.f);
-
-	g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f);
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositions.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositions.cl
@@ -1,55 +0,0 @@
-
-MSTRINGIFY(
-
-/*#define float3 float4
-
-float dot3(float3 a, float3 b)
-{
-   return a.x*b.x + a.y*b.y + a.z*b.z;
-}*/
-
-__kernel void 
-SolvePositionsFromLinksKernel( 
-	const int startLink,
-	const int numLinks,
-	const float kst,
-	const float ti,
-	__global int2 * g_linksVertexIndices,
-	__global float * g_linksMassLSC,
-	__global float * g_linksRestLengthSquared,
-	__global float * g_verticesInverseMass,
-	__global float4 * g_vertexPositions)
-	
-{
-	int linkID = get_global_id(0) + startLink;
-	if( get_global_id(0) < numLinks )
-	{	
-		float massLSC = g_linksMassLSC[linkID];
-		float restLengthSquared = g_linksRestLengthSquared[linkID];
-		
-		if( massLSC > 0.0f )
-		{		
-			int2 nodeIndices = g_linksVertexIndices[linkID];
-			int node0 = nodeIndices.x;
-			int node1 = nodeIndices.y;
-			
-			float3 position0 = g_vertexPositions[node0].xyz;
-			float3 position1 = g_vertexPositions[node1].xyz;
-
-			float inverseMass0 = g_verticesInverseMass[node0];
-			float inverseMass1 = g_verticesInverseMass[node1]; 
-
-			float3 del = position1 - position0;
-			float len  = dot(del, del);
-			float k    = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
-			position0 = position0 - del*(k*inverseMass0);
-			position1 = position1 + del*(k*inverseMass1);
-
-			g_vertexPositions[node0] = (float4)(position0, 0.f);
-			g_vertexPositions[node1] = (float4)(position1, 0.f);
-
-		}
-	}
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositionsSIMDBatched.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositionsSIMDBatched.cl
@@ -1,129 +0,0 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-MSTRINGIFY(
-
-float mydot3(float4 a, float4 b)
-{
-   return a.x*b.x + a.y*b.y + a.z*b.z;
-}
-
-__kernel void 
-SolvePositionsFromLinksKernel( 
-	const int startWaveInBatch,
-	const int numWaves,
-	const float kst,
-	const float ti,
-	__global int2 *g_wavefrontBatchCountsVertexCounts,
-	__global int *g_vertexAddressesPerWavefront,
-	__global int2 * g_linksVertexIndices,
-	__global float * g_linksMassLSC,
-	__global float * g_linksRestLengthSquared,
-	__global float * g_verticesInverseMass,
-	__global float4 * g_vertexPositions,
-	__local int2 *wavefrontBatchCountsVertexCounts,
-	__local float4 *vertexPositionSharedData,
-	__local float *vertexInverseMassSharedData)
-{
-	const int laneInWavefront = (get_global_id(0) & (WAVEFRONT_SIZE-1));
-	const int wavefront = startWaveInBatch + (get_global_id(0) / WAVEFRONT_SIZE);
-	const int firstWavefrontInBlock = startWaveInBatch + get_group_id(0) * WAVEFRONT_BLOCK_MULTIPLIER;
-	const int localWavefront = wavefront - firstWavefrontInBlock;
-
-	// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier	
-	if( wavefront < (startWaveInBatch + numWaves) )
-	{	
-		// Load the batch counts for the wavefronts
-		
-		int2 batchesAndVerticesWithinWavefront = g_wavefrontBatchCountsVertexCounts[wavefront];
-		int batchesWithinWavefront = batchesAndVerticesWithinWavefront.x;
-		int verticesUsedByWave = batchesAndVerticesWithinWavefront.y;
-
-		// Load the vertices for the wavefronts
-		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
-		{
-			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
-
-			vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_vertexPositions[vertexAddress];
-			vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress];
-		}
-		
-		mem_fence(CLK_LOCAL_MEM_FENCE);
-
-		// Loop through the batches performing the solve on each in LDS
-		int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE;	
-
-		//for( int batch = 0; batch < batchesWithinWavefront; ++batch )
-		
-		int batch = 0;
-		do
-		{
-			int baseDataLocation = baseDataLocationForWave + WAVEFRONT_SIZE * batch;
-			int locationOfValue = baseDataLocation + laneInWavefront;
-			
-			
-			// These loads should all be perfectly linear across the WF
-			int2 localVertexIndices = g_linksVertexIndices[locationOfValue];
-			float massLSC = g_linksMassLSC[locationOfValue];
-			float restLengthSquared = g_linksRestLengthSquared[locationOfValue];
-			
-			// LDS vertex addresses based on logical wavefront number in block and loaded index
-			int vertexAddress0 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.x;
-			int vertexAddress1 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.y;
-			
-			float4 position0 = vertexPositionSharedData[vertexAddress0];
-			float4 position1 = vertexPositionSharedData[vertexAddress1];
-
-			float inverseMass0 = vertexInverseMassSharedData[vertexAddress0];
-			float inverseMass1 = vertexInverseMassSharedData[vertexAddress1]; 
-
-			float4 del = position1 - position0;
-			float len = mydot3(del, del);
-			
-			float k = 0;
-			if( massLSC > 0.0f )
-			{		
-				k = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
-			}
-			
-			position0 = position0 - del*(k*inverseMass0);
-			position1 = position1 + del*(k*inverseMass1);
-			
-			// Ensure compiler does not re-order memory operations
-			mem_fence(CLK_LOCAL_MEM_FENCE);
-
-			vertexPositionSharedData[vertexAddress0] = position0;
-			vertexPositionSharedData[vertexAddress1] = position1;
-			
-			// Ensure compiler does not re-order memory operations
-			mem_fence(CLK_LOCAL_MEM_FENCE);
-				
-			
-			++batch;
-		} while( batch < batchesWithinWavefront );
-
-		// Update the global memory vertices for the wavefronts
-		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
-		{
-			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
-
-			g_vertexPositions[vertexAddress] = (float4)(vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex].xyz, 0.f);
-		}		
-		
-	}
-
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdateConstants.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdateConstants.cl
@@ -1,44 +0,0 @@
-MSTRINGIFY(
-
-/*#define float3 float4
-
-float dot3(float3 a, float3 b)
-{
-   return a.x*b.x + a.y*b.y + a.z*b.z;
-}*/
-
-__kernel void 
-UpdateConstantsKernel( 
-	const int numLinks,
-	__global int2 * g_linksVertexIndices,
-	__global float4 * g_vertexPositions,
-	__global float * g_vertexInverseMasses,
-	__global float * g_linksMaterialLSC,
-	__global float * g_linksMassLSC,
-	__global float * g_linksRestLengthSquared,
-	__global float * g_linksRestLengths)
-{
-	int linkID = get_global_id(0);
-	if( linkID < numLinks )
-	{	
-		int2 nodeIndices = g_linksVertexIndices[linkID];
-		int node0 = nodeIndices.x;
-		int node1 = nodeIndices.y;
-		float linearStiffnessCoefficient = g_linksMaterialLSC[ linkID ];
-		
-		float3 position0   = g_vertexPositions[node0].xyz;
-		float3 position1   = g_vertexPositions[node1].xyz;
-		float inverseMass0 = g_vertexInverseMasses[node0];
-		float inverseMass1 = g_vertexInverseMasses[node1];
-
-		float3 difference = position0 - position1;
-		float length2 = dot(difference, difference);
-		float length = sqrt(length2);
-	
-		g_linksRestLengths[linkID] = length;
-		g_linksMassLSC[linkID] = (inverseMass0 + inverseMass1)/linearStiffnessCoefficient;
-		g_linksRestLengthSquared[linkID] = length*length;		
-	}
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdateNodes.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdateNodes.cl
@@ -1,40 +0,0 @@
-MSTRINGIFY(
-
-//#define float3 float4
-
-__kernel void 
-updateVelocitiesFromPositionsWithVelocitiesKernel( 
-	int numNodes,
-	float isolverdt,
-	__global float4 * g_vertexPositions,
-	__global float4 * g_vertexPreviousPositions,
-	__global int * g_vertexClothIndices,
-	__global float *g_clothVelocityCorrectionCoefficients,
-	__global float * g_clothDampingFactor,
-	__global float4 * g_vertexVelocities,
-	__global float4 * g_vertexForces)
-{
-	int nodeID = get_global_id(0);
-	if( nodeID < numNodes )
-	{	
-		float3 position = g_vertexPositions[nodeID].xyz;
-		float3 previousPosition = g_vertexPreviousPositions[nodeID].xyz;
-		float3 velocity = g_vertexVelocities[nodeID].xyz;
-		int clothIndex = g_vertexClothIndices[nodeID];
-		float velocityCorrectionCoefficient = g_clothVelocityCorrectionCoefficients[clothIndex];
-		float dampingFactor = g_clothDampingFactor[clothIndex];
-		float velocityCoefficient = (1.f - dampingFactor);
-		
-		float3 difference = position - previousPosition;
-				
-		velocity += difference*velocityCorrectionCoefficient*isolverdt;
-		
-		// Damp the velocity
-		velocity *= velocityCoefficient;
-		
-		g_vertexVelocities[nodeID] = (float4)(velocity, 0.f);
-		g_vertexForces[nodeID] = (float4)(0.f, 0.f, 0.f, 0.f);								
-	}
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdateNormals.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdateNormals.cl
@@ -1,103 +0,0 @@
-MSTRINGIFY(
-
-//#define float3 float4
-
-/*float length3(float3 a)
-{
-	a.w = 0;
-	return length(a);
-}
-
-float normalize3(float3 a)
-{
-	a.w = 0;
-	return normalize(a);
-}*/
-
-__kernel void 
-ResetNormalsAndAreasKernel(
-	const unsigned int numNodes,
-	__global float4 * g_vertexNormals,
-	__global float * g_vertexArea)
-{
-	if( get_global_id(0) < numNodes )
-	{
-		g_vertexNormals[get_global_id(0)] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
-		g_vertexArea[get_global_id(0)]    = 0.0f;
-	}
-}
-
-__kernel void 
-UpdateSoftBodiesKernel(
-	const unsigned int startFace,
-	const unsigned int numFaces,
-	__global int4 * g_triangleVertexIndexSet,
-	__global float4 * g_vertexPositions,
-	__global float4 * g_vertexNormals,
-	__global float * g_vertexArea,
-	__global float4 * g_triangleNormals,
-	__global float * g_triangleArea)
-{
-	int faceID = get_global_id(0) + startFace;
-	if( get_global_id(0) < numFaces )
-	{		
-		int4 triangleIndexSet = g_triangleVertexIndexSet[ faceID ];
-		int nodeIndex0 = triangleIndexSet.x;
-		int nodeIndex1 = triangleIndexSet.y;
-		int nodeIndex2 = triangleIndexSet.z;
-
-		float3 node0 = g_vertexPositions[nodeIndex0].xyz;
-		float3 node1 = g_vertexPositions[nodeIndex1].xyz;
-		float3 node2 = g_vertexPositions[nodeIndex2].xyz;
-		float3 nodeNormal0 = g_vertexNormals[nodeIndex0].xyz;
-		float3 nodeNormal1 = g_vertexNormals[nodeIndex1].xyz;
-		float3 nodeNormal2 = g_vertexNormals[nodeIndex2].xyz;
-		float vertexArea0 = g_vertexArea[nodeIndex0];
-		float vertexArea1 = g_vertexArea[nodeIndex1];
-		float vertexArea2 = g_vertexArea[nodeIndex2];
-		
-		float3 vector0 = node1 - node0;
-		float3 vector1 = node2 - node0;
-		
-		float3 faceNormal = cross(vector0.xyz, vector1.xyz);
-		float triangleArea = length(faceNormal);
-
-		nodeNormal0 = nodeNormal0 + faceNormal;
-		nodeNormal1 = nodeNormal1 + faceNormal;
-		nodeNormal2 = nodeNormal2 + faceNormal;
-		vertexArea0 = vertexArea0 + triangleArea;
-		vertexArea1 = vertexArea1 + triangleArea;
-		vertexArea2 = vertexArea2 + triangleArea;
-		
-		g_triangleNormals[faceID] = (float4)(normalize(faceNormal), 0.f);
-		g_vertexNormals[nodeIndex0] = (float4)(nodeNormal0, 0.f);
-		g_vertexNormals[nodeIndex1] = (float4)(nodeNormal1, 0.f);
-		g_vertexNormals[nodeIndex2] = (float4)(nodeNormal2, 0.f);
-		g_triangleArea[faceID] = triangleArea;
-		g_vertexArea[nodeIndex0] = vertexArea0;
-		g_vertexArea[nodeIndex1] = vertexArea1;
-		g_vertexArea[nodeIndex2] = vertexArea2;
-	}
-}
-
-__kernel void 
-NormalizeNormalsAndAreasKernel( 
-	const unsigned int numNodes,
-	__global int * g_vertexTriangleCount,
-	__global float4 * g_vertexNormals,
-	__global float * g_vertexArea)
-{
-	if( get_global_id(0) < numNodes )
-	{
-		float4 normal = g_vertexNormals[get_global_id(0)];
-		float area = g_vertexArea[get_global_id(0)];
-		int numTriangles = g_vertexTriangleCount[get_global_id(0)];
-		
-		float vectorLength = length(normal);
-		
-		g_vertexNormals[get_global_id(0)] = normalize(normal);
-		g_vertexArea[get_global_id(0)] = area/(float)(numTriangles);
-	}
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdatePositions.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdatePositions.cl
@@ -1,36 +0,0 @@
-MSTRINGIFY(
-
-//#define float3 float4
-
-__kernel void 
-updateVelocitiesFromPositionsWithoutVelocitiesKernel( 
-	const int numNodes,
-	const float isolverdt,
-	__global float4 * g_vertexPositions,
-	__global float4 * g_vertexPreviousPositions,
-	__global int * g_vertexClothIndices,
-	__global float * g_clothDampingFactor,
-	__global float4 * g_vertexVelocities,
-	__global float4 * g_vertexForces)
-
-{
-	int nodeID = get_global_id(0);
-	if( nodeID < numNodes )
-	{	
-		float3 position = g_vertexPositions[nodeID].xyz;
-		float3 previousPosition = g_vertexPreviousPositions[nodeID].xyz;
-		float3 velocity = g_vertexVelocities[nodeID].xyz;
-		int clothIndex = g_vertexClothIndices[nodeID];
-		float dampingFactor = g_clothDampingFactor[clothIndex];
-		float velocityCoefficient = (1.f - dampingFactor);
-		
-		float3 difference = position - previousPosition;
-				
-		velocity = difference*velocityCoefficient*isolverdt;		
-		
-		g_vertexVelocities[nodeID] = (float4)(velocity, 0.f);
-		g_vertexForces[nodeID] = (float4)(0.f, 0.f, 0.f, 0.f);								
-	}
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdatePositionsFromVelocities.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/UpdatePositionsFromVelocities.cl
@@ -1,26 +0,0 @@
-MSTRINGIFY(
-
-//#define float3 float4
-
-__kernel void 
-UpdatePositionsFromVelocitiesKernel( 
-	const int numNodes,
-	const float solverSDT,
-	__global float4 * g_vertexVelocities,
-	__global float4 * g_vertexPreviousPositions,
-	__global float4 * g_vertexCurrentPosition)
-{
-	int vertexID = get_global_id(0);
-	if( vertexID < numNodes )
-	{	
-		float3 previousPosition = g_vertexPreviousPositions[vertexID].xyz;
-		float3 velocity         = g_vertexVelocities[vertexID].xyz;
-		
-		float3 newPosition      = previousPosition + velocity*solverSDT;
-		
-		g_vertexCurrentPosition[vertexID]   = (float4)(newPosition, 0.f);
-		g_vertexPreviousPositions[vertexID] = (float4)(newPosition, 0.f);
-	}
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/VSolveLinks.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/VSolveLinks.cl
@@ -1,45 +0,0 @@
-MSTRINGIFY(
-
-__kernel void 
-VSolveLinksKernel( 
-	int startLink,
-	int numLinks,
-	float kst,
-	__global int2 * g_linksVertexIndices,
-	__global float * g_linksLengthRatio,
-	__global float4 * g_linksCurrentLength,
-	__global float * g_vertexInverseMass,
-	__global float4 * g_vertexVelocity)
-{
-	int linkID = get_global_id(0) + startLink;
-	if( get_global_id(0) < numLinks )
-	{		
-		int2 nodeIndices = g_linksVertexIndices[linkID];
-		int node0 = nodeIndices.x;
-		int node1 = nodeIndices.y;
-		
-		float linkLengthRatio = g_linksLengthRatio[linkID];
-		float3 linkCurrentLength = g_linksCurrentLength[linkID].xyz;
-		
-		float3 vertexVelocity0 = g_vertexVelocity[node0].xyz;
-		float3 vertexVelocity1 = g_vertexVelocity[node1].xyz;
-
-		float vertexInverseMass0 = g_vertexInverseMass[node0];
-		float vertexInverseMass1 = g_vertexInverseMass[node1]; 
-
-		float3 nodeDifference = vertexVelocity0 - vertexVelocity1;
-		float dotResult = dot(linkCurrentLength, nodeDifference);
-		float j = -dotResult*linkLengthRatio*kst;
-		
-		float3 velocityChange0 = linkCurrentLength*(j*vertexInverseMass0);
-		float3 velocityChange1 = linkCurrentLength*(j*vertexInverseMass1);
-		
-		vertexVelocity0 += velocityChange0;
-		vertexVelocity1 -= velocityChange1;
-
-		g_vertexVelocity[node0] = (float4)(vertexVelocity0, 0.f);
-		g_vertexVelocity[node1] = (float4)(vertexVelocity1, 0.f);
-	}
-}
-
-);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ApplyForces.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ApplyForces.cl
@@ -6,6 +6,18 @@ float adot3(float4 a, float4 b)
   return a.x*b.x + a.y*b.y + a.z*b.z;
 }

+float alength3(float4 a)
+{
+	a.w = 0;
+	return length(a);
+}
+
+float4 anormalize3(float4 a)
+{
+	a.w = 0;
+	return normalize(a);
+}
+
 float4 projectOnAxis( float4 v, float4 a )
 {
 	return (a*adot3(v, a));
@@ -53,36 +65,35 @@ ApplyForcesKernel(

 			g_vertexVelocity[nodeID] = nodeV;

-			float4 relativeWindVelocity = nodeV - clothWindVelocity;
-			float relativeSpeedSquared = dot(relativeWindVelocity, relativeWindVelocity);
+			// Aerodynamics
+			float4 rel_v = nodeV - clothWindVelocity;
+			float rel_v_len = alength3(rel_v);
+			float rel_v2 = dot(rel_v, rel_v);
 			
-			if( relativeSpeedSquared > epsilon )
+			if( rel_v2 > epsilon )
 			{
-				// Correct direction of normal relative to wind direction and get dot product
-				normal = normal * (dot(normal, relativeWindVelocity) < 0 ? -1.f : 1.f);
-				float dvNormal = dot(normal, relativeWindVelocity);
-				if( dvNormal > 0 )
-				{
-					float4 force = (float4)(0.f, 0.f, 0.f, 0.f);
-					float c0 = area * dvNormal * relativeSpeedSquared / 2.f;
-					float c1 = c0 * mediumDensity;
-					force += normal * (-c1 * liftFactor);
-					force += normalize(relativeWindVelocity)*(-c1 * dragFactor);
+				float4 rel_v_nrm = anormalize3(rel_v);
+				float4 nrm = normal;
 									
-					float dtim = solverdt * nodeIM;
-					float4 forceDTIM = force * dtim;
+				nrm = nrm * (dot(nrm, rel_v) < 0 ? -1.f : 1.f);

-					float4 nodeFPlusForce = nodeF + force;
+				float4 fDrag = (float4)(0.f, 0.f, 0.f, 0.f);
+				float4 fLift = (float4)(0.f, 0.f, 0.f, 0.f);

-					// m_nodesf[i] -= ProjectOnAxis(m_nodesv[i], force.normalized())/dtim;	
-					float4 nodeFMinus = nodeF - (projectOnAxis(nodeV, normalize(force))/dtim);
+				float n_dot_v = dot(nrm, rel_v_nrm);

-					nodeF = nodeFPlusForce;
-					//if( dot(forceDTIM, forceDTIM) > dot(nodeV, nodeV) )
-					//	nodeF = nodeFMinus;
+				// drag force
+				if ( dragFactor > 0.f )
+					fDrag = 0.5f * dragFactor * mediumDensity * rel_v2 * area * n_dot_v * (-1.0f) * rel_v_nrm;

+				// lift force
+				// Check angle of attack
+				// cos(10<EFBFBD>) = 0.98480
+				if ( 0 < n_dot_v && n_dot_v < 0.98480f)
+					fLift = 0.5f * liftFactor * mediumDensity * rel_v_len * area * sqrt(1.0f-n_dot_v*n_dot_v) * (cross(cross(nrm, rel_v_nrm), rel_v_nrm));
+				
+				nodeF += fDrag + fLift;
 					g_vertexForceAccumulator[nodeID] = nodeF;	
-				}
 			}
 		}
 	}
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateFixedVertexPositions.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateFixedVertexPositions.cl
@@ -0,0 +1,25 @@
+MSTRINGIFY(
+	
+__kernel void 
+UpdateFixedVertexPositions(
+	const uint numNodes,
+	__global int * g_anchorIndex,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_anchorPositions GUID_ARG)
+{
+	unsigned int nodeID = get_global_id(0);
+
+	if( nodeID < numNodes )
+	{		
+		int anchorIndex  = g_anchorIndex[nodeID];
+		float4 position = g_vertexPositions[nodeID];
+
+		if ( anchorIndex >= 0 )
+		{
+			float4 anchorPosition = g_anchorPositions[anchorIndex];
+			g_vertexPositions[nodeID] = anchorPosition;
+		}
+	}		
+}
+
+);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp
@@ -5,17 +5,10 @@
 #include "btSoftBodySolverVertexBuffer_OpenGL.h"
 #include "BulletSoftBody/btSoftBody.h"

-#if (0)//CL_VERSION_1_1 == 1)
- //OpenCL 1.1 kernels use float3
-#define MSTRINGIFY(A) #A
-static char* OutputToVertexArrayCLString =
-#include "OpenCLC/OutputToVertexArray.cl"
-#else
 ////OpenCL 1.0 kernels don't use float3
 #define MSTRINGIFY(A) #A
 static char* OutputToVertexArrayCLString =
 #include "OpenCLC10/OutputToVertexArray.cl"
-#endif //CL_VERSION_1_1

 	
 #define RELEASE_CL_KERNEL(kernelName) {if( kernelName ){ clReleaseKernel( kernelName ); kernelName = 0; }}
@@ -105,8 +98,8 @@ bool btSoftBodySolverOutputCLtoGL::buildShaders()
 	if( m_shadersInitialized )
 		return true;
 	
-	outputToVertexArrayWithNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithNormalsKernel" );
-	outputToVertexArrayWithoutNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithoutNormalsKernel" );
+	outputToVertexArrayWithNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithNormalsKernel" ,"","OpenCLC10/OutputToVertexArray.cl");
+	outputToVertexArrayWithoutNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithoutNormalsKernel" ,"","OpenCLC10/OutputToVertexArray.cl");


 	if( returnVal )
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
@@ -45,7 +45,7 @@ public:

 	virtual bool moveToAccelerator();

-	virtual bool moveFromAccelerator();
+	virtual bool moveFromAccelerator(bool bCopy = false, bool bCopyMinimum = true);
 };


--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
@@ -20,10 +20,13 @@ subject to the following restrictions:
 #include "btSoftBodySolver_OpenCL.h"
 #include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
 #include "BulletSoftBody/btSoftBody.h"
+#include "BulletSoftBody/btSoftBodyInternals.h"
 #include "BulletCollision/CollisionShapes/btCapsuleShape.h"
+#include "BulletCollision/COllisionShapes/btSphereShape.h"
 #include "LinearMath/btQuickprof.h"
 #include <limits.h>

+
 #define BT_SUPPRESS_OPENCL_ASSERTS

 #ifdef USE_MINICL
@@ -36,8 +39,7 @@ subject to the following restrictions:
 	#endif //__APPLE__
 #endif//USE_MINICL

-#define BT_DEFAULT_WORKGROUPSIZE 128
-
+#define BT_DEFAULT_WORKGROUPSIZE 64


 #define RELEASE_CL_KERNEL(kernelName) {if( kernelName ){ clReleaseKernel( kernelName ); kernelName = 0; }}
@@ -45,32 +47,6 @@ subject to the following restrictions:

 //CL_VERSION_1_1 seems broken on NVidia SDK so just disable it

-#if (0)//CL_VERSION_1_1 == 1)
- //OpenCL 1.1 kernels use float3
-#define MSTRINGIFY(A) #A
-static const char* PrepareLinksCLString = 
-#include "OpenCLC/PrepareLinks.cl"
-static const char* UpdatePositionsFromVelocitiesCLString = 
-#include "OpenCLC/UpdatePositionsFromVelocities.cl"
-static const char* SolvePositionsCLString = 
-#include "OpenCLC/SolvePositions.cl"
-static const char* UpdateNodesCLString = 
-#include "OpenCLC/UpdateNodes.cl"
-static const char* UpdatePositionsCLString = 
-#include "OpenCLC/UpdatePositions.cl"
-static const char* UpdateConstantsCLString = 
-#include "OpenCLC/UpdateConstants.cl"
-static const char* IntegrateCLString = 
-#include "OpenCLC/Integrate.cl"
-static const char* ApplyForcesCLString = 
-#include "OpenCLC/ApplyForces.cl"
-static const char* UpdateNormalsCLString = 
-#include "OpenCLC/UpdateNormals.cl"
-static const char* VSolveLinksCLString = 
-#include "OpenCLC/VSolveLinks.cl"
-static const char* SolveCollisionsAndUpdateVelocitiesCLString =
-#include "OpenCLC/SolveCollisionsAndUpdateVelocities.cl"
-#else
 ////OpenCL 1.0 kernels don't use float3
 #define MSTRINGIFY(A) #A
 static const char* PrepareLinksCLString = 
@@ -89,13 +65,14 @@ static const char* IntegrateCLString =
 #include "OpenCLC10/Integrate.cl"
 static const char* ApplyForcesCLString = 
 #include "OpenCLC10/ApplyForces.cl"
+static const char* UpdateFixedVertexPositionsCLString = 
+#include "OpenCLC10/UpdateFixedVertexPositions.cl"
 static const char* UpdateNormalsCLString = 
 #include "OpenCLC10/UpdateNormals.cl"
 static const char* VSolveLinksCLString = 
 #include "OpenCLC10/VSolveLinks.cl"
 static const char* SolveCollisionsAndUpdateVelocitiesCLString =
 #include "OpenCLC10/SolveCollisionsAndUpdateVelocities.cl"
-#endif //CL_VERSION_1_1


 btSoftBodyVertexDataOpenCL::btSoftBodyVertexDataOpenCL( cl_command_queue queue, cl_context ctx) :
@@ -141,18 +118,42 @@ bool btSoftBodyVertexDataOpenCL::moveToAccelerator()
 	return success;
 }

-bool btSoftBodyVertexDataOpenCL::moveFromAccelerator()
+bool btSoftBodyVertexDataOpenCL::moveFromAccelerator(bool bCopy, bool bCopyMinimum)
 {
 	bool success = true;
-	success = success && m_clClothIdentifier.moveFromGPU();
-	success = success && m_clVertexPosition.moveFromGPU();
-	success = success && m_clVertexPreviousPosition.moveFromGPU();
-	success = success && m_clVertexVelocity.moveFromGPU();
-	success = success && m_clVertexForceAccumulator.moveFromGPU();
-	success = success && m_clVertexNormal.moveFromGPU();
-	success = success && m_clVertexInverseMass.moveFromGPU();
-	success = success && m_clVertexArea.moveFromGPU();
-	success = success && m_clVertexTriangleCount.moveFromGPU();
+
+	if (!bCopy)
+	{
+		success = success && m_clClothIdentifier.moveFromGPU();
+		success = success && m_clVertexPosition.moveFromGPU();
+		success = success && m_clVertexPreviousPosition.moveFromGPU();
+		success = success && m_clVertexVelocity.moveFromGPU();
+		success = success && m_clVertexForceAccumulator.moveFromGPU();
+		success = success && m_clVertexNormal.moveFromGPU();
+		success = success && m_clVertexInverseMass.moveFromGPU();
+		success = success && m_clVertexArea.moveFromGPU();
+		success = success && m_clVertexTriangleCount.moveFromGPU();
+	}
+	else
+	{
+		if (bCopyMinimum)
+		{
+			success = success && m_clVertexPosition.copyFromGPU();
+			success = success && m_clVertexNormal.copyFromGPU();
+		}
+		else
+		{
+			success = success && m_clClothIdentifier.copyFromGPU();
+			success = success && m_clVertexPosition.copyFromGPU();
+			success = success && m_clVertexPreviousPosition.copyFromGPU();
+			success = success && m_clVertexVelocity.copyFromGPU();
+			success = success && m_clVertexForceAccumulator.copyFromGPU();
+			success = success && m_clVertexNormal.copyFromGPU();
+			success = success && m_clVertexInverseMass.copyFromGPU();
+			success = success && m_clVertexArea.copyFromGPU();
+			success = success && m_clVertexTriangleCount.copyFromGPU();
+		}
+	}

 	if( success )
 		m_onGPU = true;
@@ -160,9 +161,6 @@ bool btSoftBodyVertexDataOpenCL::moveFromAccelerator()
 	return success;
 }

-
-
-
 btSoftBodyLinkDataOpenCL::btSoftBodyLinkDataOpenCL(cl_command_queue queue,  cl_context ctx) 
 :m_cqCommandQue(queue),
 	m_clLinks( queue, ctx, &m_links, false ),
@@ -602,11 +600,12 @@ void btSoftBodyTriangleDataOpenCL::generateBatches()



-btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_context ctx) :
+btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_context ctx, bool bUpdateAchchoredNodePos) :
 	m_linkData(queue, ctx),
 	m_vertexData(queue, ctx),
 	m_triangleData(queue, ctx),
-	m_clFunctions(queue, ctx),
+	m_defaultCLFunctions(queue, ctx),
+	m_currentCLFunctions(&m_defaultCLFunctions),
 	m_clPerClothAcceleration(queue, ctx, &m_perClothAcceleration, true ),
 	m_clPerClothWindVelocity(queue, ctx, &m_perClothWindVelocity, true ),
 	m_clPerClothDampingFactor(queue,ctx, &m_perClothDampingFactor, true ),
@@ -617,10 +616,14 @@ btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_contex
 	m_clPerClothCollisionObjects( queue, ctx, &m_perClothCollisionObjects, true ),
 	m_clCollisionObjectDetails( queue, ctx, &m_collisionObjectDetails, true ),
 	m_clPerClothFriction( queue, ctx, &m_perClothFriction, false ),
+	m_clAnchorPosition( queue, ctx, &m_anchorPosition, true ),
+	m_clAnchorIndex( queue, ctx, &m_anchorIndex, true),
 	m_cqCommandQue( queue ),
 	m_cxMainContext(ctx),
-	m_defaultWorkGroupSize(BT_DEFAULT_WORKGROUPSIZE)
+	m_defaultWorkGroupSize(BT_DEFAULT_WORKGROUPSIZE),
+	m_bUpdateAnchoredNodePos(bUpdateAchchoredNodePos)
 {
+
 	// Initial we will clearly need to update solver constants
 	// For now this is global for the cloths linked with this solver - we should probably make this body specific 
 	// for performance in future once we understand more clearly when constants need to be updated
@@ -643,6 +646,7 @@ btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_contex
 	m_normalizeNormalsAndAreasKernel = 0;
 	m_outputToVertexArrayKernel = 0;
 	m_applyForcesKernel = 0;
+	m_updateFixedVertexPositionsKernel = 0;
 }

 btOpenCLSoftBodySolver::~btOpenCLSoftBodySolver()
@@ -666,14 +670,16 @@ void btOpenCLSoftBodySolver::releaseKernels()
 	RELEASE_CL_KERNEL( m_normalizeNormalsAndAreasKernel );
 	RELEASE_CL_KERNEL( m_outputToVertexArrayKernel );
 	RELEASE_CL_KERNEL( m_applyForcesKernel );
+	RELEASE_CL_KERNEL( m_updateFixedVertexPositionsKernel );

 	m_shadersInitialized = false;
 }

-void btOpenCLSoftBodySolver::copyBackToSoftBodies()
+void btOpenCLSoftBodySolver::copyBackToSoftBodies(bool bMove)
 {
+
 	// Move the vertex data back to the host first
-	m_vertexData.moveFromAccelerator();
+	m_vertexData.moveFromAccelerator(!bMove);

 	// Loop over soft bodies, copying all the vertex positions back for each body in turn
 	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
@@ -688,15 +694,16 @@ void btOpenCLSoftBodySolver::copyBackToSoftBodies()
 		for( int vertex = 0; vertex < numVertices; ++vertex )
 		{
 			using Vectormath::Aos::Point3;
-			Point3 vertexPosition( getVertexData().getVertexPositions()[firstVertex + vertex] );
+			Point3 vertexPosition( m_vertexData.getVertexPositions()[firstVertex + vertex] );
+			Point3 normal(m_vertexData.getNormal(firstVertex + vertex));

 			softBody->m_nodes[vertex].m_x.setX( vertexPosition.getX() );
 			softBody->m_nodes[vertex].m_x.setY( vertexPosition.getY() );
 			softBody->m_nodes[vertex].m_x.setZ( vertexPosition.getZ() );

-			softBody->m_nodes[vertex].m_n.setX( vertexPosition.getX() );
-			softBody->m_nodes[vertex].m_n.setY( vertexPosition.getY() );
-			softBody->m_nodes[vertex].m_n.setZ( vertexPosition.getZ() );
+			softBody->m_nodes[vertex].m_n.setX( normal.getX() );
+			softBody->m_nodes[vertex].m_n.setY( normal.getY() );
+			softBody->m_nodes[vertex].m_n.setZ( normal.getZ() );
 		}
 	}	
 } // btOpenCLSoftBodySolver::copyBackToSoftBodies
@@ -710,7 +717,10 @@ void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &sof
 		getTriangleData().clear();
 		getLinkData().clear();
 		m_softBodySet.resize(0);
+		m_anchorIndex.clear();

+		int maxPiterations = 0;
+		int maxViterations = 0;

 		for( int softBodyIndex = 0; softBodyIndex < softBodies.size(); ++softBodyIndex )
 		{
@@ -759,6 +769,8 @@ void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &sof
 				float vertexInverseMass = softBody->m_nodes[vertex].m_im;
 				desc.setInverseMass(vertexInverseMass);
 				getVertexData().setVertexAt( desc, firstVertex + vertex );
+
+				m_anchorIndex.push_back(-1.0);
 			}

 			// Copy triangles similarly
@@ -805,13 +817,76 @@ void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &sof
 			newSoftBody->setMaxTriangles( maxTriangles );
 			newSoftBody->setFirstLink( firstLink );
 			newSoftBody->setNumLinks( numLinks );
+
+			// Find maximum piterations and viterations
+			int piterations = softBody->m_cfg.piterations;
+
+            if ( piterations > maxPiterations )
+                  maxPiterations = piterations;
+
+            int viterations = softBody->m_cfg.viterations;
+
+			if ( viterations > maxViterations )
+                  maxViterations = viterations;
+
+			// zero mass
+			for( int vertex = 0; vertex < numVertices; ++vertex )
+			{
+				if ( softBody->m_nodes[vertex].m_im == 0 )
+				{
+					AnchorNodeInfoCL nodeInfo;
+					nodeInfo.clVertexIndex = firstVertex + vertex;
+					nodeInfo.pNode = &softBody->m_nodes[vertex];
+
+					m_anchorNodeInfoArray.push_back(nodeInfo);
+				}
+			}			
+
+			// anchor position
+			if ( numVertices > 0 )
+			{
+				for ( int anchorIndex = 0; anchorIndex < softBody->m_anchors.size(); anchorIndex++ )
+				{
+					btSoftBody::Node* anchorNode = softBody->m_anchors[anchorIndex].m_node;
+					btSoftBody::Node* firstNode = &softBody->m_nodes[0];
+
+					AnchorNodeInfoCL nodeInfo;
+					nodeInfo.clVertexIndex = firstVertex + (int)(anchorNode - firstNode);
+					nodeInfo.pNode = anchorNode;
+
+					m_anchorNodeInfoArray.push_back(nodeInfo);
+				}
+			}			
 		}

 		
+		m_anchorPosition.clear();		
+		m_anchorPosition.resize(m_anchorNodeInfoArray.size());
+
+		for ( int anchorNode = 0; anchorNode < m_anchorNodeInfoArray.size(); anchorNode++ )
+		{
+			const AnchorNodeInfoCL& anchorNodeInfo = m_anchorNodeInfoArray[anchorNode];
+			m_anchorIndex[anchorNodeInfo.clVertexIndex] = anchorNode;
+			getVertexData().getInverseMass(anchorNodeInfo.clVertexIndex) = 0.0f;
+		}
 		
 		updateConstants(0.f);

+		// set position and velocity iterations
+		setNumberOfPositionIterations(maxPiterations);
+		setNumberOfVelocityIterations(maxViterations);

+		// set wind velocity
+		m_perClothWindVelocity.resize( m_softBodySet.size() );
+		for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
+		{
+			btSoftBody *softBody = m_softBodySet[softBodyIndex]->getSoftBody();			
+			m_perClothWindVelocity[softBodyIndex] = toVector3(softBody->getWindVelocity());
+		}
+
+		m_clPerClothWindVelocity.changedOnCPU();
+
+		// generate batches
 		m_linkData.generateBatches();		
 		m_triangleData.generateBatches();

@@ -861,7 +936,6 @@ void btOpenCLSoftBodySolver::resetNormalsAndAreas( int numVertices )

 void btOpenCLSoftBodySolver::normalizeNormalsAndAreas( int numVertices )
 {
-
 	cl_int ciErrNum;

 	ciErrNum = clSetKernelArg(m_normalizeNormalsAndAreasKernel, 0, sizeof(int),(void*) &numVertices);
@@ -882,7 +956,6 @@ void btOpenCLSoftBodySolver::normalizeNormalsAndAreas( int numVertices )

 void btOpenCLSoftBodySolver::executeUpdateSoftBodies( int firstTriangle, int numTriangles )
 {
-
 	cl_int ciErrNum;
 	ciErrNum = clSetKernelArg(m_updateSoftBodiesKernel, 0, sizeof(int), (void*) &firstTriangle);
 	ciErrNum = clSetKernelArg(m_updateSoftBodiesKernel, 1, sizeof(int), &numTriangles);
@@ -948,9 +1021,34 @@ void btOpenCLSoftBodySolver::ApplyClampedForce( float solverdt, const Vectormath
 	}
 }

+void btOpenCLSoftBodySolver::updateFixedVertexPositions()
+{	
+	// Ensure data is on accelerator
+	m_vertexData.moveToAccelerator();
+	m_clAnchorPosition.moveToGPU();
+	m_clAnchorIndex.moveToGPU();
+
+	cl_int ciErrNum ;
+	int numVerts = m_vertexData.getNumVertices();
+	ciErrNum = clSetKernelArg(m_updateFixedVertexPositionsKernel, 0, sizeof(int), &numVerts);
+	ciErrNum = clSetKernelArg(m_updateFixedVertexPositionsKernel,1, sizeof(cl_mem), &m_clAnchorIndex.m_buffer);
+	ciErrNum = clSetKernelArg(m_updateFixedVertexPositionsKernel,2, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
+	ciErrNum = clSetKernelArg(m_updateFixedVertexPositionsKernel,3, sizeof(cl_mem), &m_clAnchorPosition.m_buffer);
+
+	size_t numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
+	if (numWorkItems)
+	{
+		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_updateFixedVertexPositionsKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize, 0,0,0);
+		if( ciErrNum != CL_SUCCESS ) 
+		{
+			btAssert( 0 &&  "enqueueNDRangeKernel(m_updateFixedVertexPositionsKernel)");
+		}
+	}
+
+}
+
 void btOpenCLSoftBodySolver::applyForces( float solverdt )
 {	
-
 	// Ensure data is on accelerator
 	m_vertexData.moveToAccelerator();
 	m_clPerClothAcceleration.moveToGPU();
@@ -976,6 +1074,7 @@ void btOpenCLSoftBodySolver::applyForces( float solverdt )
 	ciErrNum = clSetKernelArg(m_applyForcesKernel,11, sizeof(cl_mem), &m_clPerClothMediumDensity.m_buffer);
 	ciErrNum = clSetKernelArg(m_applyForcesKernel,12, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
 	ciErrNum = clSetKernelArg(m_applyForcesKernel,13, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
+
 	size_t numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
 	if (numWorkItems)
 	{
@@ -993,8 +1092,6 @@ void btOpenCLSoftBodySolver::applyForces( float solverdt )
 */
 void btOpenCLSoftBodySolver::integrate( float solverdt )
 {
-	
-
 	// Ensure data is on accelerator
 	m_vertexData.moveToAccelerator();

@@ -1186,7 +1283,7 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 		updateVelocitiesFromPositionsWithoutVelocities( 1.f/solverdt );
 	}

-	// Solve drift
+	// Solve position
 	for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
 	{
 		for( int i = 0; i < m_linkData.m_batchStartLengths.size(); ++i )
@@ -1210,7 +1307,6 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 // Kernel dispatches
 void btOpenCLSoftBodySolver::prepareLinks()
 {
-
 	cl_int ciErrNum;
 	int numLinks = m_linkData.getNumLinks();
 	ciErrNum = clSetKernelArg(m_prepareLinksKernel,0, sizeof(int), &numLinks);
@@ -1231,7 +1327,6 @@ void btOpenCLSoftBodySolver::prepareLinks()

 void btOpenCLSoftBodySolver::updatePositionsFromVelocities( float solverdt )
 {
-
 	cl_int ciErrNum;
 	int numVerts = m_vertexData.getNumVertices();
 	ciErrNum = clSetKernelArg(m_updatePositionsFromVelocitiesKernel,0, sizeof(int), &numVerts);
@@ -1251,7 +1346,6 @@ void btOpenCLSoftBodySolver::updatePositionsFromVelocities( float solverdt )

 void btOpenCLSoftBodySolver::solveLinksForPosition( int startLink, int numLinks, float kst, float ti )
 {
-
 	cl_int ciErrNum;
 	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,0, sizeof(int), &startLink);
 	ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,1, sizeof(int), &numLinks);
@@ -1275,7 +1369,6 @@ void btOpenCLSoftBodySolver::solveLinksForPosition( int startLink, int numLinks,

 void btOpenCLSoftBodySolver::solveLinksForVelocity( int startLink, int numLinks, float kst )
 {
-
 	cl_int ciErrNum;
 	ciErrNum = clSetKernelArg(m_vSolveLinksKernel, 0, sizeof(int), &startLink);
 	ciErrNum = clSetKernelArg(m_vSolveLinksKernel, 1, sizeof(int), &numLinks);
@@ -1297,7 +1390,6 @@ void btOpenCLSoftBodySolver::solveLinksForVelocity( int startLink, int numLinks,

 void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithVelocities( float isolverdt )
 {
-
 	cl_int ciErrNum;
 	int numVerts = m_vertexData.getNumVertices();
 	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithVelocitiesKernel,0, sizeof(int), &numVerts);
@@ -1322,7 +1414,6 @@ void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithVelocities( float

 void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( float isolverdt )
 {
-
 	cl_int ciErrNum;
 	int numVerts = m_vertexData.getNumVertices();
 	ciErrNum = clSetKernelArg(m_updateVelocitiesFromPositionsWithoutVelocitiesKernel, 0, sizeof(int), &numVerts);
@@ -1347,7 +1438,6 @@ void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( flo

 void btOpenCLSoftBodySolver::solveCollisionsAndUpdateVelocities( float isolverdt )
 {
-
 	// Copy kernel parameters to GPU
 	m_vertexData.moveToAccelerator();
 	m_clPerClothFriction.moveToGPU();
@@ -1355,7 +1445,6 @@ void btOpenCLSoftBodySolver::solveCollisionsAndUpdateVelocities( float isolverdt
 	m_clPerClothCollisionObjects.moveToGPU();
 	m_clCollisionObjectDetails.moveToGPU();

-
 	cl_int ciErrNum;
 	int numVerts = m_vertexData.getNumVertices();
 	ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 0, sizeof(int), &numVerts);
@@ -1446,7 +1535,7 @@ void btSoftBodySolverOutputCLtoCPU::copySoftBodyToVertexBuffer( const btSoftBody



-cl_kernel CLFunctions::compileCLKernelFromString( const char* kernelSource, const char* kernelName, const char* additionalMacros )
+cl_kernel CLFunctions::compileCLKernelFromString( const char* kernelSource, const char* kernelName, const char* additionalMacros ,const char* orgSrcFileNameForCaching)
 {
 	printf("compiling kernelName: %s ",kernelName);
 	cl_kernel kernel=0;
@@ -1554,20 +1643,25 @@ void btOpenCLSoftBodySolver::predictMotion( float timeStep )
 	m_clPerClothCollisionObjects.changedOnCPU();
 	m_collisionObjectDetails.clear();	

+	if ( m_bUpdateAnchoredNodePos )
 	{
-		BT_PROFILE("perClothWindVelocity");
-		// Fill the force arrays with current acceleration data etc
-		m_perClothWindVelocity.resize( m_softBodySet.size() );
-		for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
+		// In OpenCL cloth solver, if softbody node has zero inverse mass(infinite mass) or anchor attached, 
+		// we need to update the node position in case the node or anchor is animated externally.
+		// If there is no such node, we can eliminate the unnecessary CPU-to-GPU data trasferring. 
+		for ( int i = 0; i < m_anchorNodeInfoArray.size(); i++ )
 		{
-			btSoftBody *softBody = m_softBodySet[softBodyIndex]->getSoftBody();
+			const AnchorNodeInfoCL& anchorNodeInfo = m_anchorNodeInfoArray[i];
+			btSoftBody::Node* node = anchorNodeInfo.pNode;

-			m_perClothWindVelocity[softBodyIndex] = toVector3(softBody->getWindVelocity());
+			using Vectormath::Aos::Point3;
+			Point3 pos((float)node->m_x.getX(), (float)node->m_x.getY(), (float)node->m_x.getZ());				
+			m_anchorPosition[i] = pos;
 		}
-	}
-	{
-		BT_PROFILE("changedOnCPU");
-		m_clPerClothWindVelocity.changedOnCPU();
+
+		if ( m_anchorNodeInfoArray.size() > 0 )
+			m_clAnchorPosition.changedOnCPU();
+
+		updateFixedVertexPositions();
 	}
 		
 	{
@@ -1601,7 +1695,7 @@ static Vectormath::Aos::Transform3 toTransform3( const btTransform &transform )

 void btOpenCLAcceleratedSoftBodyInterface::updateBounds( const btVector3 &lowerBound, const btVector3 &upperBound )
 {
-	float scalarMargin = this->getSoftBody()->getCollisionShape()->getMargin();
+	float scalarMargin = (float)getSoftBody()->getCollisionShape()->getMargin();
 	btVector3 vectorMargin( scalarMargin, scalarMargin, scalarMargin );
 	m_softBody->m_bounds[0] = lowerBound - vectorMargin;
 	m_softBody->m_bounds[1] = upperBound + vectorMargin;
@@ -1641,7 +1735,8 @@ void btOpenCLSoftBodySolver::processCollision( btSoftBody *softBody, btCollision
 			newCollisionShapeDescription.angularVelocity = toVector3(body->getAngularVelocity());
 			m_collisionObjectDetails.push_back( newCollisionShapeDescription );

-		} else {
+		} 		
+		else {
 			printf("Unsupported collision shape type\n");
 			//btAssert(0 && "Unsupported collision shape type\n");
 		}
@@ -1688,32 +1783,36 @@ bool btOpenCLSoftBodySolver::checkInitialized()

 bool btOpenCLSoftBodySolver::buildShaders()
 {
-	// Ensure current kernels are released first
-	releaseKernels();
-	
 	if( m_shadersInitialized )
 		return true;

-	m_clFunctions.clearKernelCompilationFailures();
+	const char* additionalMacros="";

-	m_prepareLinksKernel = m_clFunctions.compileCLKernelFromString( PrepareLinksCLString, "PrepareLinksKernel" );
-	m_updatePositionsFromVelocitiesKernel = m_clFunctions.compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel" );
-	m_solvePositionsFromLinksKernel = m_clFunctions.compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel" );
-	m_vSolveLinksKernel = m_clFunctions.compileCLKernelFromString( VSolveLinksCLString, "VSolveLinksKernel" );
-	m_updateVelocitiesFromPositionsWithVelocitiesKernel = m_clFunctions.compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel" );
-	m_updateVelocitiesFromPositionsWithoutVelocitiesKernel = m_clFunctions.compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel" );
-	m_solveCollisionsAndUpdateVelocitiesKernel = m_clFunctions.compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel" );
-	m_integrateKernel = m_clFunctions.compileCLKernelFromString( IntegrateCLString, "IntegrateKernel" );
-	m_applyForcesKernel = m_clFunctions.compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel" );
+	// Ensure current kernels are released first
+	releaseKernels();
+	
+	m_currentCLFunctions->clearKernelCompilationFailures();
+
+	m_prepareLinksKernel = m_currentCLFunctions->compileCLKernelFromString( PrepareLinksCLString, "PrepareLinksKernel",additionalMacros,"OpenCLC10/PrepareLinks.cl" );
+	m_updatePositionsFromVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel" ,additionalMacros,"OpenCLC10/UpdatePositionsFromVelocities.cl");
+	m_solvePositionsFromLinksKernel = m_currentCLFunctions->compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel",additionalMacros,"OpenCLC10/SolvePositions.cl" );
+	m_vSolveLinksKernel = m_currentCLFunctions->compileCLKernelFromString( VSolveLinksCLString, "VSolveLinksKernel" ,additionalMacros,"OpenCLC10/VSolveLinks.cl");
+	m_updateVelocitiesFromPositionsWithVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel" ,additionalMacros,"OpenCLC10/UpdateNodes.cl");
+	m_updateVelocitiesFromPositionsWithoutVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel" ,additionalMacros,"OpenCLC10/UpdatePositions.cl");
+	m_solveCollisionsAndUpdateVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel" ,additionalMacros,"OpenCLC10/SolveCollisionsAndUpdateVelocities.cl");
+	m_integrateKernel = m_currentCLFunctions->compileCLKernelFromString( IntegrateCLString, "IntegrateKernel" ,additionalMacros,"OpenCLC10/Integrate.cl");
+	m_applyForcesKernel = m_currentCLFunctions->compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel" ,additionalMacros,"OpenCLC10/ApplyForces.cl");
+	m_updateFixedVertexPositionsKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateFixedVertexPositionsCLString, "UpdateFixedVertexPositions" , additionalMacros, "OpenCLC10/UpdateFixedVertexPositions.cl");

 	// TODO: Rename to UpdateSoftBodies
-	m_resetNormalsAndAreasKernel = m_clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel" );
-	m_normalizeNormalsAndAreasKernel = m_clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel" );
-	m_updateSoftBodiesKernel = m_clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel" );
+	m_resetNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel" ,additionalMacros,"OpenCLC10/UpdateNormals.cl");
+	m_normalizeNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel" ,additionalMacros,"OpenCLC10/UpdateNormals.cl");
+	m_updateSoftBodiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel" ,additionalMacros,"OpenCLC10/UpdateNormals.cl");


-	if( m_clFunctions.getKernelCompilationFailures()==0 )
+	if( m_currentCLFunctions->getKernelCompilationFailures()==0 )
 		m_shadersInitialized = true;

 	return m_shadersInitialized;
 }
+
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
@@ -20,6 +20,7 @@ subject to the following restrictions:
 #include "vectormath/vmInclude.h"

 #include "BulletSoftBody/btSoftBodySolvers.h"
+#include "BulletSoftBody/btSoftBody.h"
 #include "btSoftBodySolverBuffer_OpenCL.h"
 #include "btSoftBodySolverLinkData_OpenCL.h"
 #include "btSoftBodySolverVertexData_OpenCL.h"
@@ -33,6 +34,7 @@ protected:

 	int	m_kernelCompilationFailures;

+
 public:
 	CLFunctions(cl_command_queue cqCommandQue, cl_context cxMainContext) :
 		m_cqCommandQue( cqCommandQue ),
@@ -49,7 +51,7 @@ public:
 	/**
 	 * Compile a compute shader kernel from a string and return the appropriate cl_kernel object.
 	 */	
-	cl_kernel compileCLKernelFromString( const char* kernelSource, const char* kernelName, const char* additionalMacros = "" );
+	virtual cl_kernel compileCLKernelFromString( const char* kernelSource, const char* kernelName, const char* additionalMacros, const char* srcFileNameForCaching);

 	void	clearKernelCompilationFailures()
 	{
@@ -285,7 +287,8 @@ public:

 protected:

-	CLFunctions m_clFunctions;
+	CLFunctions m_defaultCLFunctions;
+	CLFunctions* m_currentCLFunctions;

 	/** Variable to define whether we need to update solver constants on the next iteration */
 	bool m_updateSolverConstants;
@@ -350,7 +353,20 @@ protected:
 	btAlignedObjectArray< float >	m_perClothFriction;
 	btOpenCLBuffer< float >			m_clPerClothFriction;

+	// anchor node info
+	struct AnchorNodeInfoCL
+	{
+		int clVertexIndex;
+		btSoftBody::Node* pNode;
+	};

+	btAlignedObjectArray<AnchorNodeInfoCL> m_anchorNodeInfoArray;
+	btAlignedObjectArray<Vectormath::Aos::Point3> m_anchorPosition;
+	btOpenCLBuffer<Vectormath::Aos::Point3>		  m_clAnchorPosition;
+	btAlignedObjectArray<int> m_anchorIndex;
+	btOpenCLBuffer<int>		  m_clAnchorIndex;
+
+	bool m_bUpdateAnchoredNodePos;

 	cl_kernel		m_prepareLinksKernel;
 	cl_kernel		m_solvePositionsFromLinksKernel;
@@ -368,6 +384,7 @@ protected:

 	cl_kernel		m_outputToVertexArrayKernel;
 	cl_kernel		m_applyForcesKernel;
+	cl_kernel       m_updateFixedVertexPositionsKernel;	

 	cl_command_queue	m_cqCommandQue;
 	cl_context			m_cxMainContext;
@@ -394,6 +411,8 @@ protected:

 	virtual void applyForces( float solverdt );

+	void updateFixedVertexPositions();
+
 	/**
 	 * Integrate motion on the solver.
 	 */
@@ -430,7 +449,7 @@ protected:
 	void releaseKernels();

 public:
-	btOpenCLSoftBodySolver(cl_command_queue queue,cl_context	ctx);
+	btOpenCLSoftBodySolver(cl_command_queue queue,cl_context	ctx, bool bUpdateAchchoredNodePos = false);

 	virtual ~btOpenCLSoftBodySolver();

@@ -456,7 +475,7 @@ public:

 	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);

-	virtual void copyBackToSoftBodies();
+	virtual void copyBackToSoftBodies(bool bMove = true);

 	virtual void solveConstraints( float solverdt );

@@ -475,6 +494,14 @@ public:
 		return m_defaultWorkGroupSize;
 	}

+	void	setCLFunctions(CLFunctions* funcs)
+	{
+		if (funcs)
+			m_currentCLFunctions = funcs;
+		else
+			m_currentCLFunctions  = &m_defaultCLFunctions;
+	}
+
 }; // btOpenCLSoftBodySolver


--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp
@@ -33,32 +33,6 @@ static const size_t workGroupSize = GROUP_SIZE;

 //CL_VERSION_1_1 seems broken on NVidia SDK so just disable it

-#if (0)//CL_VERSION_1_1 == 1)
- //OpenCL 1.1 kernels use float3
-#define MSTRINGIFY(A) #A
-static const char* UpdatePositionsFromVelocitiesCLString = 
-#include "OpenCLC/UpdatePositionsFromVelocities.cl"
-static const char* SolvePositionsCLString = 
-#include "OpenCLC/SolvePositionsSIMDBatched.cl"
-static const char* UpdateNodesCLString = 
-#include "OpenCLC/UpdateNodes.cl"
-static const char* UpdatePositionsCLString = 
-#include "OpenCLC/UpdatePositions.cl"
-static const char* UpdateConstantsCLString = 
-#include "OpenCLC/UpdateConstants.cl"
-static const char* IntegrateCLString = 
-#include "OpenCLC/Integrate.cl"
-static const char* ApplyForcesCLString = 
-#include "OpenCLC/ApplyForces.cl"
-static const char* UpdateNormalsCLString = 
-#include "OpenCLC/UpdateNormals.cl"
-static const char* VSolveLinksCLString = 
-#include "OpenCLC/VSolveLinks.cl"
-static const char* SolveCollisionsAndUpdateVelocitiesCLString =
-#include "OpenCLC/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl"
-static const char* OutputToVertexArrayCLString =
-#include "OpenCLC/OutputToVertexArray.cl"
-#else
 ////OpenCL 1.0 kernels don't use float3
 #define MSTRINGIFY(A) #A
 static const char* UpdatePositionsFromVelocitiesCLString = 
@@ -75,6 +49,8 @@ static const char* IntegrateCLString =
 #include "OpenCLC10/Integrate.cl"
 static const char* ApplyForcesCLString = 
 #include "OpenCLC10/ApplyForces.cl"
+static const char* UpdateFixedVertexPositionsCLString = 
+#include "OpenCLC10/UpdateFixedVertexPositions.cl"
 static const char* UpdateNormalsCLString = 
 #include "OpenCLC10/UpdateNormals.cl"
 static const char* VSolveLinksCLString = 
@@ -83,7 +59,6 @@ static const char* SolveCollisionsAndUpdateVelocitiesCLString =
 #include "OpenCLC10/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl"
 static const char* OutputToVertexArrayCLString =
 #include "OpenCLC10/OutputToVertexArray.cl"
-#endif //CL_VERSION_1_1



@@ -194,8 +169,8 @@ bool btSoftBodyLinkDataOpenCLSIMDAware::moveFromAccelerator()



-btOpenCLSoftBodySolverSIMDAware::btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue, cl_context ctx) :
-	btOpenCLSoftBodySolver( queue, ctx ),
+btOpenCLSoftBodySolverSIMDAware::btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue, cl_context ctx, bool bUpdateAchchoredNodePos) :
+	btOpenCLSoftBodySolver( queue, ctx, bUpdateAchchoredNodePos ),
 	m_linkData(queue, ctx)
 {
 	// Initial we will clearly need to update solver constants
@@ -213,14 +188,17 @@ btOpenCLSoftBodySolverSIMDAware::~btOpenCLSoftBodySolverSIMDAware()

 void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody * > &softBodies ,bool forceUpdate)
 {
-	if( forceUpdate|| m_softBodySet.size() != softBodies.size() )
+	if( forceUpdate || m_softBodySet.size() != softBodies.size() )
 	{
 		// Have a change in the soft body set so update, reloading all the data
 		getVertexData().clear();
 		getTriangleData().clear();
 		getLinkData().clear();
 		m_softBodySet.resize(0);
+		m_anchorIndex.clear();

+		int maxPiterations = 0;
+		int maxViterations = 0;

 		for( int softBodyIndex = 0; softBodyIndex < softBodies.size(); ++softBodyIndex )
 		{
@@ -238,8 +216,7 @@ void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody
 			m_perClothLiftFactor.push_back( softBody->m_cfg.kLF );
 			m_perClothDragFactor.push_back( softBody->m_cfg.kDG );
 			m_perClothMediumDensity.push_back(softBody->getWorldInfo()->air_density);
-
-
+			// Simple init values. Actually we'll put 0 and -1 into them at the appropriate time
 			m_perClothFriction.push_back( softBody->getFriction() );
 			m_perClothCollisionObjects.push_back( CollisionObjectIndices(-1, -1) );

@@ -253,6 +230,7 @@ void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody
 			// Allocate space for new vertices in all the vertex arrays
 			getVertexData().createVertices( numVertices, softBodyIndex, maxVertices );

+
 			int firstTriangle = getTriangleData().getNumTriangles();
 			int numTriangles = softBody->m_faces.size();
 			int maxTriangles = numTriangles;
@@ -272,6 +250,8 @@ void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody
 				float vertexInverseMass = softBody->m_nodes[vertex].m_im;
 				desc.setInverseMass(vertexInverseMass);
 				getVertexData().setVertexAt( desc, firstVertex + vertex );
+
+				m_anchorIndex.push_back(-1.0);
 			}

 			// Copy triangles similarly
@@ -318,17 +298,78 @@ void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody
 			newSoftBody->setMaxTriangles( maxTriangles );
 			newSoftBody->setFirstLink( firstLink );
 			newSoftBody->setNumLinks( numLinks );
+
+			// Find maximum piterations and viterations
+			int piterations = softBody->m_cfg.piterations;
+
+            if ( piterations > maxPiterations )
+                  maxPiterations = piterations;
+
+            int viterations = softBody->m_cfg.viterations;
+
+			if ( viterations > maxViterations )
+                  maxViterations = viterations;
+
+			// zero mass
+			for( int vertex = 0; vertex < numVertices; ++vertex )
+			{
+				if ( softBody->m_nodes[vertex].m_im == 0 )
+				{
+					AnchorNodeInfoCL nodeInfo;
+					nodeInfo.clVertexIndex = firstVertex + vertex;
+					nodeInfo.pNode = &softBody->m_nodes[vertex];
+
+					m_anchorNodeInfoArray.push_back(nodeInfo);
+				}
+			}			
+
+			// anchor position
+			if ( numVertices > 0 )
+			{
+				for ( int anchorIndex = 0; anchorIndex < softBody->m_anchors.size(); anchorIndex++ )
+				{
+					btSoftBody::Node* anchorNode = softBody->m_anchors[anchorIndex].m_node;
+					btSoftBody::Node* firstNode = &softBody->m_nodes[0];
+
+					AnchorNodeInfoCL nodeInfo;
+					nodeInfo.clVertexIndex = firstVertex + (int)(anchorNode - firstNode);
+					nodeInfo.pNode = anchorNode;
+
+					m_anchorNodeInfoArray.push_back(nodeInfo);
+				}
+			}			
 		}

+		m_anchorPosition.clear();		
+		m_anchorPosition.resize(m_anchorNodeInfoArray.size());

+		for ( int anchorNode = 0; anchorNode < m_anchorNodeInfoArray.size(); anchorNode++ )
+		{
+			const AnchorNodeInfoCL& anchorNodeInfo = m_anchorNodeInfoArray[anchorNode];
+			m_anchorIndex[anchorNodeInfo.clVertexIndex] = anchorNode;
+			getVertexData().getInverseMass(anchorNodeInfo.clVertexIndex) = 0.0f;
+		}
 		
 		updateConstants(0.f);

+		// set position and velocity iterations
+		setNumberOfPositionIterations(maxPiterations);
+		setNumberOfVelocityIterations(maxViterations);

+		// set wind velocity
+		m_perClothWindVelocity.resize( m_softBodySet.size() );
+		for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
+		{
+			btSoftBody *softBody = m_softBodySet[softBodyIndex]->getSoftBody();			
+			m_perClothWindVelocity[softBodyIndex] = toVector3(softBody->getWindVelocity());
+		}
+
+		m_clPerClothWindVelocity.changedOnCPU();
+
+		// generate batches
 		m_linkData.generateBatches();		
 		m_triangleData.generateBatches();

-		
 		// Build the shaders to match the batching parameters
 		buildShaders();
 	}
@@ -509,7 +550,9 @@ bool btOpenCLSoftBodySolverSIMDAware::buildShaders()
 	if( m_shadersInitialized )
 		return true;

-	m_clFunctions.clearKernelCompilationFailures();
+	const char* additionalMacros="";
+
+	m_currentCLFunctions->clearKernelCompilationFailures();

 	char *wavefrontMacros = new char[256];

@@ -522,22 +565,23 @@ bool btOpenCLSoftBodySolverSIMDAware::buildShaders()
 		WAVEFRONT_BLOCK_MULTIPLIER,
 		WAVEFRONT_BLOCK_MULTIPLIER*m_linkData.getWavefrontSize());
 	
-	m_updatePositionsFromVelocitiesKernel = m_clFunctions.compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel", "" );
-	m_solvePositionsFromLinksKernel = m_clFunctions.compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel", wavefrontMacros );
-	m_updateVelocitiesFromPositionsWithVelocitiesKernel = m_clFunctions.compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel", "" );
-	m_updateVelocitiesFromPositionsWithoutVelocitiesKernel = m_clFunctions.compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel", "" );
-	m_integrateKernel = m_clFunctions.compileCLKernelFromString( IntegrateCLString, "IntegrateKernel", "" );
-	m_applyForcesKernel = m_clFunctions.compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel", "" );
-	m_solveCollisionsAndUpdateVelocitiesKernel = m_clFunctions.compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel", "" );
+	m_updatePositionsFromVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel", additionalMacros,"OpenCLC10/UpdatePositionsFromVelocities.cl");
+	m_solvePositionsFromLinksKernel = m_currentCLFunctions->compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel", wavefrontMacros ,"OpenCLC10/SolvePositionsSIMDBatched.cl");
+	m_updateVelocitiesFromPositionsWithVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel", additionalMacros ,"OpenCLC10/UpdateNodes.cl");
+	m_updateVelocitiesFromPositionsWithoutVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel", additionalMacros,"OpenCLC10/UpdatePositions.cl");
+	m_integrateKernel = m_currentCLFunctions->compileCLKernelFromString( IntegrateCLString, "IntegrateKernel", additionalMacros ,"OpenCLC10/Integrate.cl");
+	m_applyForcesKernel = m_currentCLFunctions->compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel", additionalMacros,"OpenCLC10/ApplyForces.cl" );
+	m_updateFixedVertexPositionsKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateFixedVertexPositionsCLString, "UpdateFixedVertexPositions" ,additionalMacros,"OpenCLC10/UpdateFixedVertexPositions.cl");
+	m_solveCollisionsAndUpdateVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel", additionalMacros ,"OpenCLC10/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl");

 	// TODO: Rename to UpdateSoftBodies
-	m_resetNormalsAndAreasKernel = m_clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel", "" );
-	m_normalizeNormalsAndAreasKernel = m_clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel", "" );
-	m_updateSoftBodiesKernel = m_clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel", "" );
+	m_resetNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel", additionalMacros ,"OpenCLC10/UpdateNormals.cl");
+	m_normalizeNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel", additionalMacros ,"OpenCLC10/UpdateNormals.cl");
+	m_updateSoftBodiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel", additionalMacros ,"OpenCLC10/UpdateNormals.cl");

 	delete [] wavefrontMacros;

-	if( m_clFunctions.getKernelCompilationFailures()==0)
+	if( m_currentCLFunctions->getKernelCompilationFailures()==0)
 	{
 		m_shadersInitialized = true;
 	}
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h
@@ -39,7 +39,7 @@ protected:



-	bool buildShaders();
+	virtual bool buildShaders();


 	void updateConstants( float timeStep );
@@ -59,7 +59,7 @@ protected:
 	/////////////////////////////////////

 public:
-	btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue,cl_context	ctx);
+	btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue,cl_context	ctx, bool bUpdateAchchoredNodePos = false);

 	virtual ~btOpenCLSoftBodySolverSIMDAware();

--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/Shared/btSoftBodySolverData.h
@@ -561,10 +561,14 @@ public:
 	}

 	/**
-	 * Move data from host memory from the accelerator.
+	 * Move data to host memory from the accelerator if bCopy is false.
+	 * If bCopy is true, copy data to host memory from the accelerator so that data 
+	 * won't be moved to accelerator when moveToAccelerator() is called next time. 
+	 * If bCopyMinimum is true, only vertex position and normal are copied.
+	 * bCopyMinimum will be meaningful only if bCopy is true.
 	 * The CPU version will always return that it has moved it.
 	 */
-	virtual bool moveFromAccelerator()
+	virtual bool moveFromAccelerator(bool bCopy = false, bool bCopyMinimum = true)
 	{
 		return true;
 	}