added OpenCL cloth demo, contributed by AMD.

updated GpuSoftBodySolvers updated DirectCompute cloth demo
2010-08-14 00:56:17 +00:00
parent 40958f2b4a
commit 4f9b450200
72 changed files with 7524 additions and 843 deletions
--- a/src/BulletMultiThreaded/CMakeLists.txt
+++ b/src/BulletMultiThreaded/CMakeLists.txt
@@ -67,10 +67,8 @@ ADD_LIBRARY(BulletMultiThreaded

 )

-#for now, only Direct 11 (Direct Compute)
-IF(USE_DX11)
-	SUBDIRS(GpuSoftBodySolvers)
-ENDIF(USE_DX11)
+
+SUBDIRS(GpuSoftBodySolvers)


 IF (BUILD_SHARED_LIBS)
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/CMakeLists.txt
@@ -3,20 +3,12 @@ INCLUDE_DIRECTORIES(
 ${BULLET_PHYSICS_SOURCE_DIR}/src
 )

-LIST(APPEND SubDirList "CPU")

+SUBDIRS ( 
+	OpenCL
+	CPU 
+)

-# Configure use of OpenCL and DX11
-# Generates the settings file and defines libraries and include paths
-OPTION(USE_OPENCL "Use OpenCL"	OFF)
-
-
-
-if( USE_OPENCL )
-	LIST(APPEND SubDirList "OpenCL")
-endif( USE_OPENCL )
-if( USE_DX11 )
-	LIST(APPEND SubDirList "DX11")
-endif( USE_DX11 )
-
-SUBDIRS( ${SubDirList} )
+IF( USE_DX11 )
+	SUBDIRS( DX11 )
+ENDIF( USE_DX11 )
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/CMakeLists.txt
@@ -14,14 +14,17 @@ ${VECTOR_MATH_INCLUDE}

 SET(BulletSoftBodyDX11Solvers_SRCS
 	btSoftBodySolver_DX11.cpp
+	btSoftBodySolver_DX11SIMDAware.cpp
 )

 SET(BulletSoftBodyDX11Solvers_HDRS
 	btSoftBodySolver_DX11.h
+	btSoftBodySolver_DX11SIMDAware.h
 	../cpu/btSoftBodySolverData.h
 	btSoftBodySolverVertexData_DX11.h
 	btSoftBodySolverTriangleData_DX11.h
 	btSoftBodySolverLinkData_DX11.h
+	btSoftBodySolverLinkData_DX11SIMDAware.h
 	btSoftBodySolverBuffer_DX11.h
 	btSoftBodySolverVertexBuffer_DX11.h

@@ -37,6 +40,7 @@ SET(BulletSoftBodyDX11Solvers_Shaders
 	UpdatePositions
 	UpdateNodes
 	SolvePositions
+	SolvePositionsSIMDBatched
 	UpdatePositionsFromVelocities
 	ApplyForces
 	PrepareLinks
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositionsSIMDBatched.hlsl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositionsSIMDBatched.hlsl
@@ -0,0 +1,128 @@
+MSTRINGIFY(
+
+cbuffer SolvePositionsFromLinksKernelCB : register( b0 )
+{
+	int startWaveInBatch;
+	int numWaves;
+	float kst;		
+	float ti;
+};
+
+
+// Number of batches per wavefront stored one element per logical wavefront
+StructuredBuffer<int2> g_wavefrontBatchCountsVertexCounts : register( t0 );
+// Set of up to maxNumVertices vertex addresses per wavefront
+StructuredBuffer<int> g_vertexAddressesPerWavefront : register( t1 );
+
+StructuredBuffer<float> g_verticesInverseMass : register( t2 );
+
+// Per-link data layed out structured in terms of sub batches within wavefronts
+StructuredBuffer<int2> g_linksVertexIndices : register( t3 );
+StructuredBuffer<float> g_linksMassLSC : register( t4 );
+StructuredBuffer<float> g_linksRestLengthSquared : register( t5 );
+
+RWStructuredBuffer<float4> g_vertexPositions : register( u0 );
+
+// Data loaded on a per-wave basis
+groupshared int2 wavefrontBatchCountsVertexCounts[WAVEFRONT_BLOCK_MULTIPLIER];
+groupshared float4 vertexPositionSharedData[MAX_NUM_VERTICES_PER_WAVE*WAVEFRONT_BLOCK_MULTIPLIER];
+groupshared float vertexInverseMassSharedData[MAX_NUM_VERTICES_PER_WAVE*WAVEFRONT_BLOCK_MULTIPLIER];
+
+// Storing the vertex addresses actually slowed things down a little
+//groupshared int vertexAddressSharedData[MAX_NUM_VERTICES_PER_WAVE*WAVEFRONT_BLOCK_MULTIPLIER];
+
+
+[numthreads(BLOCK_SIZE, 1, 1)]
+void 
+SolvePositionsFromLinksKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	const int laneInWavefront = (DTid.x & (WAVEFRONT_SIZE-1));
+	const int wavefront = startWaveInBatch + (DTid.x / WAVEFRONT_SIZE);
+	const int firstWavefrontInBlock = startWaveInBatch + Gid.x * WAVEFRONT_BLOCK_MULTIPLIER;
+	const int localWavefront = wavefront - firstWavefrontInBlock;
+
+	// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier	
+	if( wavefront < (startWaveInBatch + numWaves) )
+	{
+
+		// Load the batch counts for the wavefronts
+		// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier
+		if( laneInWavefront == 0 )
+		{
+			int2 batchesAndVertexCountsWithinWavefront = g_wavefrontBatchCountsVertexCounts[firstWavefrontInBlock + localWavefront];
+			wavefrontBatchCountsVertexCounts[localWavefront] = batchesAndVertexCountsWithinWavefront;
+		}
+
+		
+		int2 batchesAndVerticesWithinWavefront = wavefrontBatchCountsVertexCounts[localWavefront];
+		int batchesWithinWavefront = batchesAndVerticesWithinWavefront.x;
+		int verticesUsedByWave = batchesAndVerticesWithinWavefront.y;
+
+		// Load the vertices for the wavefronts
+		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
+		{
+			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+
+			//vertexAddressSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = vertexAddress;
+			vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_vertexPositions[vertexAddress];
+			vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress];
+		}
+		
+		// Loop through the batches performing the solve on each in LDS
+		int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE;	
+
+		//for( int batch = 0; batch < batchesWithinWavefront; ++batch )
+		
+		int batch = 0;
+		do
+		{
+			int baseDataLocation = baseDataLocationForWave + WAVEFRONT_SIZE * batch;
+			int locationOfValue = baseDataLocation + laneInWavefront;
+			
+			
+			// These loads should all be perfectly linear across the WF
+			int2 localVertexIndices = g_linksVertexIndices[locationOfValue];
+			float massLSC = g_linksMassLSC[locationOfValue];
+			float restLengthSquared = g_linksRestLengthSquared[locationOfValue];
+			
+
+			// LDS vertex addresses based on logical wavefront number in block and loaded index
+			int vertexAddress0 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.x;
+			int vertexAddress1 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.y;
+			
+			float3 position0 = vertexPositionSharedData[vertexAddress0].xyz;
+			float3 position1 = vertexPositionSharedData[vertexAddress1].xyz;
+
+			float inverseMass0 = vertexInverseMassSharedData[vertexAddress0];
+			float inverseMass1 = vertexInverseMassSharedData[vertexAddress1]; 
+
+			float3 del = position1 - position0;
+			float len = dot(del, del);
+			
+			float k = 0;
+			if( massLSC > 0.0f )
+			{		
+				k = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
+			}
+			
+			position0 = position0 - del*(k*inverseMass0);
+			position1 = position1 + del*(k*inverseMass1);
+
+			vertexPositionSharedData[vertexAddress0] = float4(position0, 0.f);
+			vertexPositionSharedData[vertexAddress1] = float4(position1, 0.f);
+			
+			++batch;
+		} while( batch < batchesWithinWavefront );
+		
+		// Update the global memory vertices for the wavefronts
+		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
+		{
+			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+
+			g_vertexPositions[vertexAddress] = vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+		}
+	}
+		
+}
+
+);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11SIMDAware.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11SIMDAware.h
@@ -0,0 +1,173 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_DX11.h"
+
+#ifndef BT_ACCELERATED_SOFT_BODY_LINK_DATA_DX11_SIMDAWARE_H
+#define BT_ACCELERATED_SOFT_BODY_LINK_DATA_DX11_SIMDAWARE_H
+
+struct ID3D11Device;
+struct ID3D11DeviceContext;
+
+
+class btSoftBodyLinkDataDX11SIMDAware : public btSoftBodyLinkData
+{
+public:
+	bool				m_onGPU;
+	ID3D11Device		*m_d3dDevice;
+	ID3D11DeviceContext *m_d3dDeviceContext;
+
+	const int m_wavefrontSize;
+	const int m_linksPerWorkItem;
+	const int m_maxLinksPerWavefront;
+	int m_maxBatchesWithinWave;
+	int m_maxVerticesWithinWave;
+	int m_numWavefronts;
+
+	int m_maxVertex;
+
+	struct NumBatchesVerticesPair
+	{
+		int numBatches;
+		int numVertices;
+	};
+
+	// Array storing number of links in each wavefront
+	btAlignedObjectArray<int>									m_linksPerWavefront;
+	btAlignedObjectArray<NumBatchesVerticesPair>				m_numBatchesAndVerticesWithinWaves;
+	btDX11Buffer< NumBatchesVerticesPair >						m_dx11NumBatchesAndVerticesWithinWaves;
+
+	// All arrays here will contain batches of m_maxLinksPerWavefront links
+	// ordered by wavefront.
+	// with either global vertex pairs or local vertex pairs
+	btAlignedObjectArray< int >									m_wavefrontVerticesGlobalAddresses; // List of global vertices per wavefront
+	btDX11Buffer<int>											m_dx11WavefrontVerticesGlobalAddresses;
+	btAlignedObjectArray< LinkNodePair >						m_linkVerticesLocalAddresses; // Vertex pair for the link
+	btDX11Buffer<LinkNodePair>									m_dx11LinkVerticesLocalAddresses;
+	btDX11Buffer<float>											m_dx11LinkStrength;
+	btDX11Buffer<float>											m_dx11LinksMassLSC;
+	btDX11Buffer<float>											m_dx11LinksRestLengthSquared;
+	btDX11Buffer<float>											m_dx11LinksRestLength;
+	btDX11Buffer<float>											m_dx11LinksMaterialLinearStiffnessCoefficient;
+
+	struct BatchPair
+	{
+		int start;
+		int length;
+
+		BatchPair() :
+			start(0),
+			length(0)
+		{
+		}
+
+		BatchPair( int s, int l ) : 
+			start( s ),
+			length( l )
+		{
+		}
+	};
+
+	/**
+	 * Link addressing information for each cloth.
+	 * Allows link locations to be computed independently of data batching.
+	 */
+	btAlignedObjectArray< int >							m_linkAddresses;
+
+	/**
+	 * Start and length values for computation batches over link data.
+	 */
+	btAlignedObjectArray< BatchPair >		m_wavefrontBatchStartLengths;
+
+
+	//ID3D11Buffer*               readBackBuffer;
+	
+	btSoftBodyLinkDataDX11SIMDAware( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext );
+
+	virtual ~btSoftBodyLinkDataDX11SIMDAware();
+
+	/** Allocate enough space in all link-related arrays to fit numLinks links */
+	virtual void createLinks( int numLinks );
+	
+	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
+	virtual void setLinkAt( const LinkDescription &link, int linkIndex );
+
+	virtual bool onAccelerator();
+
+	virtual bool moveToAccelerator();
+
+	virtual bool moveFromAccelerator();
+
+	/**
+	 * Generate (and later update) the batching for the entire link set.
+	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
+	 * In theory we could delay it until just before we need the cloth.
+	 * It's a one-off overhead, though, so that is a later optimisation.
+	 */
+	void generateBatches();
+
+	int getMaxVerticesPerWavefront()
+	{
+		return m_maxVerticesWithinWave;
+	}
+
+	int getWavefrontSize()
+	{
+		return m_wavefrontSize;
+	}
+
+	int getLinksPerWorkItem()
+	{
+		return m_linksPerWorkItem;
+	}
+
+	int getMaxLinksPerWavefront()
+	{
+		return m_maxLinksPerWavefront;
+	}
+
+	int getMaxBatchesPerWavefront()
+	{
+		return m_maxBatchesWithinWave;
+	}
+
+	int getNumWavefronts()
+	{
+		return m_numWavefronts;
+	}
+
+	NumBatchesVerticesPair getNumBatchesAndVerticesWithinWavefront( int wavefront )
+	{
+		return m_numBatchesAndVerticesWithinWaves[wavefront];
+	}
+
+	int getVertexGlobalAddresses( int vertexIndex )
+	{
+		return m_wavefrontVerticesGlobalAddresses[vertexIndex];
+	}
+
+	/**
+	 * Get post-batching local addresses of the vertex pair for a link assuming all vertices used by a wavefront are loaded locally.
+	 */
+	LinkNodePair getVertexPairLocalAddresses( int linkIndex )
+	{
+		return m_linkVerticesLocalAddresses[linkIndex];
+	}
+
+};
+
+
+#endif // #ifndef BT_ACCELERATED_SOFT_BODY_LINK_DATA_DX11_SIMDAWARE_H
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
@@ -622,7 +622,7 @@ void btDX11SoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &softB
 			using Vectormath::Aos::Point3;

 			// Create SoftBody that will store the information within the solver
-			btAcceleratedSoftBodyInterface *newSoftBody = new btAcceleratedSoftBodyInterface( softBody );
+			btDX11AcceleratedSoftBodyInterface *newSoftBody = new btDX11AcceleratedSoftBodyInterface( softBody );
 			m_softBodySet.push_back( newSoftBody );

 			m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );
@@ -1451,11 +1451,11 @@ void btDX11SoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( float



-btDX11SoftBodySolver::btAcceleratedSoftBodyInterface *btDX11SoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
+btDX11AcceleratedSoftBodyInterface *btDX11SoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
 {
 	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
 	{
-		btAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
+		btDX11AcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
 		if( softBodyInterface->getSoftBody() == softBody )
 			return softBodyInterface;
 	}
@@ -1466,7 +1466,7 @@ void btDX11SoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * const
 {
 	checkInitialized();
 	
-	btAcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );
+	btDX11AcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );


 	const int firstVertex = currentCloth->getFirstVertex();
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
@@ -13,6 +13,9 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */

+#ifndef BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
+#define BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
+

 #include "vectormath/vmInclude.h"
 #include "BulletSoftBody/btSoftBodySolvers.h"
@@ -22,185 +25,184 @@ subject to the following restrictions:
 #include "btSoftBodySolverTriangleData_DX11.h"


-#ifndef BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
-#define BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
+
+/**
+ * SoftBody class to maintain information about a soft body instance
+ * within a solver.
+ * This data addresses the main solver arrays.
+ */
+class btDX11AcceleratedSoftBodyInterface
+{
+protected:
+	/** Current number of vertices that are part of this cloth */
+	int m_numVertices;
+	/** Maximum number of vertices allocated to be part of this cloth */
+	int m_maxVertices;
+	/** Current number of triangles that are part of this cloth */
+	int m_numTriangles;
+	/** Maximum number of triangles allocated to be part of this cloth */
+	int m_maxTriangles;
+	/** Index of first vertex in the world allocated to this cloth */
+	int m_firstVertex;
+	/** Index of first triangle in the world allocated to this cloth */
+	int m_firstTriangle;
+	/** Index of first link in the world allocated to this cloth */
+	int m_firstLink;
+	/** Maximum number of links allocated to this cloth */
+	int m_maxLinks;
+	/** Current number of links allocated to this cloth */
+	int m_numLinks;
+
+	/** The actual soft body this data represents */
+	btSoftBody *m_softBody;
+
+
+public:
+	btDX11AcceleratedSoftBodyInterface( btSoftBody *softBody ) :
+	  m_softBody( softBody )
+	{
+		m_numVertices = 0;
+		m_maxVertices = 0;
+		m_numTriangles = 0;
+		m_maxTriangles = 0;
+		m_firstVertex = 0;
+		m_firstTriangle = 0;
+		m_firstLink = 0;
+		m_maxLinks = 0;
+		m_numLinks = 0;
+	}
+	int getNumVertices()
+	{
+		return m_numVertices;
+	}
+
+	int getNumTriangles()
+	{
+		return m_numTriangles;
+	}
+
+	int getMaxVertices()
+	{
+		return m_maxVertices;
+	}
+
+	int getMaxTriangles()
+	{
+		return m_maxTriangles;
+	}
+
+	int getFirstVertex()
+	{
+		return m_firstVertex;
+	}
+
+	int getFirstTriangle()
+	{
+		return m_firstTriangle;
+	}
+
+	// TODO: All of these set functions will have to do checks and
+	// update the world because restructuring of the arrays will be necessary
+	// Reasonable use of "friend"?
+	void setNumVertices( int numVertices )
+	{
+		m_numVertices = numVertices;
+	}	
+	
+	void setNumTriangles( int numTriangles )
+	{
+		m_numTriangles = numTriangles;
+	}
+
+	void setMaxVertices( int maxVertices )
+	{
+		m_maxVertices = maxVertices;
+	}
+
+	void setMaxTriangles( int maxTriangles )
+	{
+		m_maxTriangles = maxTriangles;
+	}
+
+	void setFirstVertex( int firstVertex )
+	{
+		m_firstVertex = firstVertex;
+	}
+
+	void setFirstTriangle( int firstTriangle )
+	{
+		m_firstTriangle = firstTriangle;
+	}
+
+	void setMaxLinks( int maxLinks )
+	{
+		m_maxLinks = maxLinks;
+	}
+
+	void setNumLinks( int numLinks )
+	{
+		m_numLinks = numLinks;
+	}
+
+	void setFirstLink( int firstLink )
+	{
+		m_firstLink = firstLink;
+	}
+
+	int getMaxLinks()
+	{
+		return m_maxLinks;
+	}
+
+	int getNumLinks()
+	{
+		return m_numLinks;
+	}
+
+	int getFirstLink()
+	{
+		return m_firstLink;
+	}
+
+	btSoftBody* getSoftBody()
+	{
+		return m_softBody;
+	}
+
+#if 0
+	void setAcceleration( Vectormath::Aos::Vector3 acceleration )
+	{
+		m_currentSolver->setPerClothAcceleration( m_clothIdentifier, acceleration );
+	}
+
+	void setWindVelocity( Vectormath::Aos::Vector3 windVelocity )
+	{
+		m_currentSolver->setPerClothWindVelocity( m_clothIdentifier, windVelocity );
+	}
+
+	/** 
+	 * Set the density of the air in which the cloth is situated.
+	 */
+	void setAirDensity( btScalar density )
+	{
+		m_currentSolver->setPerClothMediumDensity( m_clothIdentifier, static_cast<float>(density) );
+	}
+
+	/**
+	 * Add a collision object to this soft body.
+	 */
+	void addCollisionObject( btCollisionObject *collisionObject )
+	{
+		m_currentSolver->addCollisionObjectForSoftBody( m_clothIdentifier, collisionObject );
+	}
+#endif
+};
+

 class btDX11SoftBodySolver : public btSoftBodySolver
 {
 public:

-	/**
-	 * SoftBody class to maintain information about a soft body instance
-	 * within a solver.
-	 * This data addresses the main solver arrays.
-	 */
-	class btAcceleratedSoftBodyInterface
-	{
-	protected:
-		/** Current number of vertices that are part of this cloth */
-		int m_numVertices;
-		/** Maximum number of vertices allocated to be part of this cloth */
-		int m_maxVertices;
-		/** Current number of triangles that are part of this cloth */
-		int m_numTriangles;
-		/** Maximum number of triangles allocated to be part of this cloth */
-		int m_maxTriangles;
-		/** Index of first vertex in the world allocated to this cloth */
-		int m_firstVertex;
-		/** Index of first triangle in the world allocated to this cloth */
-		int m_firstTriangle;
-		/** Index of first link in the world allocated to this cloth */
-		int m_firstLink;
-		/** Maximum number of links allocated to this cloth */
-		int m_maxLinks;
-		/** Current number of links allocated to this cloth */
-		int m_numLinks;
-
-		/** The actual soft body this data represents */
-		btSoftBody *m_softBody;
-
-
-	public:
-		btAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
-		  m_softBody( softBody )
-		{
-			m_numVertices = 0;
-			m_maxVertices = 0;
-			m_numTriangles = 0;
-			m_maxTriangles = 0;
-			m_firstVertex = 0;
-			m_firstTriangle = 0;
-			m_firstLink = 0;
-			m_maxLinks = 0;
-			m_numLinks = 0;
-		}
-		int getNumVertices()
-		{
-			return m_numVertices;
-		}
-
-		int getNumTriangles()
-		{
-			return m_numTriangles;
-		}
-
-		int getMaxVertices()
-		{
-			return m_maxVertices;
-		}
-
-		int getMaxTriangles()
-		{
-			return m_maxTriangles;
-		}
-
-		int getFirstVertex()
-		{
-			return m_firstVertex;
-		}
-
-		int getFirstTriangle()
-		{
-			return m_firstTriangle;
-		}
-
-		// TODO: All of these set functions will have to do checks and
-		// update the world because restructuring of the arrays will be necessary
-		// Reasonable use of "friend"?
-		void setNumVertices( int numVertices )
-		{
-			m_numVertices = numVertices;
-		}	
-		
-		void setNumTriangles( int numTriangles )
-		{
-			m_numTriangles = numTriangles;
-		}
-
-		void setMaxVertices( int maxVertices )
-		{
-			m_maxVertices = maxVertices;
-		}
-
-		void setMaxTriangles( int maxTriangles )
-		{
-			m_maxTriangles = maxTriangles;
-		}
-
-		void setFirstVertex( int firstVertex )
-		{
-			m_firstVertex = firstVertex;
-		}
-
-		void setFirstTriangle( int firstTriangle )
-		{
-			m_firstTriangle = firstTriangle;
-		}
-
-		void setMaxLinks( int maxLinks )
-		{
-			m_maxLinks = maxLinks;
-		}
-
-		void setNumLinks( int numLinks )
-		{
-			m_numLinks = numLinks;
-		}
-
-		void setFirstLink( int firstLink )
-		{
-			m_firstLink = firstLink;
-		}
-
-		int getMaxLinks()
-		{
-			return m_maxLinks;
-		}
-
-		int getNumLinks()
-		{
-			return m_numLinks;
-		}
-
-		int getFirstLink()
-		{
-			return m_firstLink;
-		}
-
-		btSoftBody* getSoftBody()
-		{
-			return m_softBody;
-		}
-
-	#if 0
-		void setAcceleration( Vectormath::Aos::Vector3 acceleration )
-		{
-			m_currentSolver->setPerClothAcceleration( m_clothIdentifier, acceleration );
-		}
-
-		void setWindVelocity( Vectormath::Aos::Vector3 windVelocity )
-		{
-			m_currentSolver->setPerClothWindVelocity( m_clothIdentifier, windVelocity );
-		}
-
-		/** 
-		 * Set the density of the air in which the cloth is situated.
-		 */
-		void setAirDensity( btScalar density )
-		{
-			m_currentSolver->setPerClothMediumDensity( m_clothIdentifier, static_cast<float>(density) );
-		}
-
-		/**
-		 * Add a collision object to this soft body.
-		 */
-		void addCollisionObject( btCollisionObject *collisionObject )
-		{
-			m_currentSolver->addCollisionObjectForSoftBody( m_clothIdentifier, collisionObject );
-		}
-	#endif
-	};
-

 	class KernelDesc
 	{
@@ -344,7 +346,7 @@ private:
 	 * Cloths owned by this solver.
 	 * Only our cloths are in this array.
 	 */
-	btAlignedObjectArray< btAcceleratedSoftBodyInterface * > m_softBodySet;
+	btAlignedObjectArray< btDX11AcceleratedSoftBodyInterface * > m_softBodySet;

 	/** Acceleration value to be applied to all non-static vertices in the solver. 
 	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
@@ -429,7 +431,7 @@ private:
 	
 	void updateConstants( float timeStep );

-	btAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
+	btDX11AcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );

 	//////////////////////////////////////
 	// Kernel dispatches
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h
@@ -0,0 +1,432 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "vectormath/vmInclude.h"
+#include "BulletSoftBody/btSoftBodySolvers.h"
+#include "btSoftBodySolverVertexBuffer_DX11.h"
+#include "btSoftBodySolverLinkData_DX11SIMDAware.h"
+#include "btSoftBodySolverVertexData_DX11.h"
+#include "btSoftBodySolverTriangleData_DX11.h"
+
+
+#ifndef BT_SOFT_BODY_DX11_SOLVER_SIMDAWARE_H
+#define BT_SOFT_BODY_DX11_SOLVER_SIMDAWARE_H
+
+class btDX11SIMDAwareSoftBodySolver : public btSoftBodySolver
+{
+public:
+
+		/**
+	 * SoftBody class to maintain information about a soft body instance
+	 * within a solver.
+	 * This data addresses the main solver arrays.
+	 */
+	class btAcceleratedSoftBodyInterface
+	{
+	protected:
+		/** Current number of vertices that are part of this cloth */
+		int m_numVertices;
+		/** Maximum number of vertices allocated to be part of this cloth */
+		int m_maxVertices;
+		/** Current number of triangles that are part of this cloth */
+		int m_numTriangles;
+		/** Maximum number of triangles allocated to be part of this cloth */
+		int m_maxTriangles;
+		/** Index of first vertex in the world allocated to this cloth */
+		int m_firstVertex;
+		/** Index of first triangle in the world allocated to this cloth */
+		int m_firstTriangle;
+		/** Index of first link in the world allocated to this cloth */
+		int m_firstLink;
+		/** Maximum number of links allocated to this cloth */
+		int m_maxLinks;
+		/** Current number of links allocated to this cloth */
+		int m_numLinks;
+
+		/** The actual soft body this data represents */
+		btSoftBody *m_softBody;
+
+
+	public:
+		btAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
+		  m_softBody( softBody )
+		{
+			m_numVertices = 0;
+			m_maxVertices = 0;
+			m_numTriangles = 0;
+			m_maxTriangles = 0;
+			m_firstVertex = 0;
+			m_firstTriangle = 0;
+			m_firstLink = 0;
+			m_maxLinks = 0;
+			m_numLinks = 0;
+		}
+		int getNumVertices()
+		{
+			return m_numVertices;
+		}
+
+		int getNumTriangles()
+		{
+			return m_numTriangles;
+		}
+
+		int getMaxVertices()
+		{
+			return m_maxVertices;
+		}
+
+		int getMaxTriangles()
+		{
+			return m_maxTriangles;
+		}
+
+		int getFirstVertex()
+		{
+			return m_firstVertex;
+		}
+
+		int getFirstTriangle()
+		{
+			return m_firstTriangle;
+		}
+
+
+		void setNumVertices( int numVertices )
+		{
+			m_numVertices = numVertices;
+		}	
+		
+		void setNumTriangles( int numTriangles )
+		{
+			m_numTriangles = numTriangles;
+		}
+
+		void setMaxVertices( int maxVertices )
+		{
+			m_maxVertices = maxVertices;
+		}
+
+		void setMaxTriangles( int maxTriangles )
+		{
+			m_maxTriangles = maxTriangles;
+		}
+
+		void setFirstVertex( int firstVertex )
+		{
+			m_firstVertex = firstVertex;
+		}
+
+		void setFirstTriangle( int firstTriangle )
+		{
+			m_firstTriangle = firstTriangle;
+		}
+
+		void setMaxLinks( int maxLinks )
+		{
+			m_maxLinks = maxLinks;
+		}
+
+		void setNumLinks( int numLinks )
+		{
+			m_numLinks = numLinks;
+		}
+
+		void setFirstLink( int firstLink )
+		{
+			m_firstLink = firstLink;
+		}
+
+		int getMaxLinks()
+		{
+			return m_maxLinks;
+		}
+
+		int getNumLinks()
+		{
+			return m_numLinks;
+		}
+
+		int getFirstLink()
+		{
+			return m_firstLink;
+		}
+
+		btSoftBody* getSoftBody()
+		{
+			return m_softBody;
+		}
+
+	};
+
+
+	class KernelDesc
+	{
+	protected:
+		
+
+	public:
+		ID3D11ComputeShader* kernel;
+		ID3D11Buffer* constBuffer;
+
+		KernelDesc()
+		{
+			kernel = 0;
+			constBuffer = 0;
+		}
+
+		virtual ~KernelDesc()
+		{
+			// TODO: this should probably destroy its kernel but we need to be careful
+			// in case KernelDescs are copied
+		}
+	}; 
+
+	struct SolvePositionsFromLinksKernelCB
+	{		
+		int startWave;
+		int numWaves;
+		float kst;
+		float ti;
+	};
+
+	struct IntegrateCB
+	{
+		int numNodes;
+		float solverdt;
+		int padding1;
+		int padding2;
+	};
+
+	struct UpdatePositionsFromVelocitiesCB
+	{
+		int numNodes;
+		float solverSDT;
+		int padding1;
+		int padding2;
+	};
+
+	struct UpdateVelocitiesFromPositionsWithoutVelocitiesCB
+	{
+		int numNodes;
+		float isolverdt;
+		int padding1;
+		int padding2;
+	};
+
+	struct UpdateVelocitiesFromPositionsWithVelocitiesCB
+	{
+		int numNodes;
+		float isolverdt;
+		int padding1;
+		int padding2;
+	};
+
+	struct UpdateSoftBodiesCB
+	{
+		int numNodes;
+		int startFace;
+		int numFaces;
+		float epsilon;
+	};
+
+
+	struct OutputToVertexArrayCB
+	{
+		int startNode;
+		int numNodes;
+		int positionOffset;
+		int positionStride;
+		
+		int normalOffset;	
+		int normalStride;
+		int padding1;
+		int padding2;
+	};
+
+
+	struct ApplyForcesCB
+	{
+		unsigned int numNodes;
+		float solverdt;
+		float epsilon;
+		int padding3;
+	};
+
+	struct AddVelocityCB
+	{
+		int startNode;
+		int lastNode;
+		float velocityX;
+		float velocityY;
+		float velocityZ;
+		int padding1;
+		int padding2;
+		int padding3;
+	};
+
+
+private:
+	ID3D11Device *		 m_dx11Device;
+	ID3D11DeviceContext* m_dx11Context;
+
+
+	/** Link data for all cloths. Note that this will be sorted batch-wise for efficient computation and m_linkAddresses will maintain the addressing. */
+	btSoftBodyLinkDataDX11SIMDAware m_linkData;
+	btSoftBodyVertexDataDX11 m_vertexData;
+	btSoftBodyTriangleDataDX11 m_triangleData;
+		
+	/** Variable to define whether we need to update solver constants on the next iteration */
+	bool m_updateSolverConstants;
+
+	bool m_shadersInitialized;
+	
+	/** 
+	 * Cloths owned by this solver.
+	 * Only our cloths are in this array.
+	 */
+	btAlignedObjectArray< btAcceleratedSoftBodyInterface * > m_softBodySet;
+
+	/** Acceleration value to be applied to all non-static vertices in the solver. 
+	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
+	 */
+	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothAcceleration;
+	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11PerClothAcceleration;
+
+	/** Wind velocity to be applied normal to all non-static vertices in the solver. 
+	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
+	 */
+	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothWindVelocity;
+	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11PerClothWindVelocity;
+
+	/** Velocity damping factor */
+	btAlignedObjectArray< float >						m_perClothDampingFactor;
+	btDX11Buffer<float>									m_dx11PerClothDampingFactor;
+
+	/** Velocity correction coefficient */
+	btAlignedObjectArray< float >						m_perClothVelocityCorrectionCoefficient;
+	btDX11Buffer<float>									m_dx11PerClothVelocityCorrectionCoefficient;
+
+	/** Lift parameter for wind effect on cloth. */
+	btAlignedObjectArray< float >						m_perClothLiftFactor;
+	btDX11Buffer<float>									m_dx11PerClothLiftFactor;
+	
+	/** Drag parameter for wind effect on cloth. */
+	btAlignedObjectArray< float >						m_perClothDragFactor;
+	btDX11Buffer<float>									m_dx11PerClothDragFactor;
+
+	/** Density of the medium in which each cloth sits */
+	btAlignedObjectArray< float >						m_perClothMediumDensity;
+	btDX11Buffer<float>									m_dx11PerClothMediumDensity;
+
+	KernelDesc		solvePositionsFromLinksKernel;
+	KernelDesc		integrateKernel;
+	KernelDesc		addVelocityKernel;
+	KernelDesc		updatePositionsFromVelocitiesKernel;
+	KernelDesc		updateVelocitiesFromPositionsWithoutVelocitiesKernel;
+	KernelDesc		updateVelocitiesFromPositionsWithVelocitiesKernel;
+	KernelDesc		resetNormalsAndAreasKernel;
+	KernelDesc		normalizeNormalsAndAreasKernel;
+	KernelDesc		updateSoftBodiesKernel;
+	KernelDesc		outputToVertexArrayWithNormalsKernel;
+	KernelDesc		outputToVertexArrayWithoutNormalsKernel;
+
+	KernelDesc		outputToVertexArrayKernel;
+	KernelDesc		applyForcesKernel;
+	KernelDesc		collideSphereKernel;
+	KernelDesc		collideCylinderKernel;
+
+
+	
+	/**
+	 * Integrate motion on the solver.
+	 */
+	virtual void integrate( float solverdt );
+	float computeTriangleArea( 
+		const Vectormath::Aos::Point3 &vertex0,
+		const Vectormath::Aos::Point3 &vertex1,
+		const Vectormath::Aos::Point3 &vertex2 );
+
+
+	/**
+	 * Compile a compute shader kernel from a string and return the appropriate KernelDesc object.
+	 */
+	KernelDesc compileComputeShaderFromString( const char* shaderString, const char* shaderName, int constBufferSize, D3D10_SHADER_MACRO *compileMacros = 0 );
+
+	bool buildShaders();
+
+	void resetNormalsAndAreas( int numVertices );
+
+	void normalizeNormalsAndAreas( int numVertices );
+
+	void executeUpdateSoftBodies( int firstTriangle, int numTriangles );
+
+	Vectormath::Aos::Vector3 ProjectOnAxis( const Vectormath::Aos::Vector3 &v, const Vectormath::Aos::Vector3 &a );
+
+	void ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce );
+
+	virtual void applyForces( float solverdt );
+	
+	void updateConstants( float timeStep );
+
+	btAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
+
+	//////////////////////////////////////
+	// Kernel dispatches
+	void prepareLinks();
+
+	void updatePositionsFromVelocities( float solverdt );
+	void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
+	void solveLinksForVelocity( int startLink, int numLinks, float kst );
+	
+	void updateVelocitiesFromPositionsWithVelocities( float isolverdt );
+	void updateVelocitiesFromPositionsWithoutVelocities( float isolverdt );
+
+	// End kernel dispatches
+	/////////////////////////////////////
+
+	void releaseKernels();
+
+
+public:
+	btDX11SIMDAwareSoftBodySolver(ID3D11Device * dx11Device, ID3D11DeviceContext* dx11Context);
+
+	virtual ~btDX11SIMDAwareSoftBodySolver();
+	
+
+
+	virtual btSoftBodyLinkData &getLinkData();
+
+	virtual btSoftBodyVertexData &getVertexData();
+
+	virtual btSoftBodyTriangleData &getTriangleData();
+
+
+
+	virtual bool checkInitialized();
+
+	virtual void updateSoftBodies( );
+
+	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies );
+
+	virtual void solveConstraints( float solverdt );
+
+	virtual void predictMotion( float solverdt );
+
+	virtual void copySoftBodyToVertexBuffer( const btSoftBody *const softBody, btVertexBufferDescriptor *vertexBuffer );
+};
+
+#endif // #ifndef BT_SOFT_BODY_DX11_SOLVER_SIMDAWARE_H
+
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
@@ -0,0 +1,82 @@
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+ADD_DEFINITIONS(-DUSE_AMD_OPENCL)
+ADD_DEFINITIONS(-DCL_PLATFORM_AMD)
+
+
+IF (INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+	INCLUDE_DIRECTORIES(		$ENV{==ATISTREAMSDKROOT=}/include )
+ELSE()
+	INCLUDE_DIRECTORIES(		$ENV{ATISTREAMSDKROOT}/include	)
+ENDIF()
+
+
+
+SET(BulletSoftBodyOpenCLSolvers_SRCS
+	../btSoftBodySolver_OpenCL.cpp
+)
+
+SET(BulletSoftBodyOpenCLSolvers_HDRS
+	../btSoftBodySolver_OpenCL.h
+	../../CPU/btSoftBodySolverData.h
+	../btSoftBodySolverVertexData_OpenCL.h
+	../btSoftBodySolverTriangleData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverBuffer_OpenCL.h
+)
+
+# OpenCL and HLSL Shaders.
+# Build rules generated to stringify these into headers
+# which are needed by some of the sources
+SET(BulletSoftBodyOpenCLSolvers_Shaders
+#	OutputToVertexArray
+	UpdateNormals
+	Integrate
+	UpdatePositions
+	UpdateNodes
+	SolvePositions
+	UpdatePositionsFromVelocities
+	ApplyForces
+	PrepareLinks
+	VSolveLinks
+)
+
+foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC/${f}.cl")
+endforeach(f) 
+
+
+
+ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_AMD
+	${BulletSoftBodyOpenCLSolvers_SRCS} 
+	${BulletSoftBodyOpenCLSolvers_HDRS} 
+	${BulletSoftBodyOpenCLSolvers_OpenCLC}
+)
+
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES SOVERSION ${BULLET_VERSION})
+IF (BUILD_SHARED_LIBS)
+	TARGET_LINK_LIBRARIES(BulletSoftBody BulletDynamics)
+ENDIF (BUILD_SHARED_LIBS)
+
+
+IF (INSTALL_LIBS)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_AMD DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_AMD DESTINATION lib${LIB_SUFFIX})
+				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Apple/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Apple/CMakeLists.txt
@@ -0,0 +1,73 @@
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+
+
+
+SET(BulletSoftBodyOpenCLSolvers_SRCS
+	../btSoftBodySolver_OpenCL.cpp
+)
+
+SET(BulletSoftBodyOpenCLSolvers_HDRS
+	../btSoftBodySolver_OpenCL.h
+	../../CPU/btSoftBodySolverData.h
+	../btSoftBodySolverVertexData_OpenCL.h
+	../btSoftBodySolverTriangleData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverBuffer_OpenCL.h
+)
+
+# OpenCL and HLSL Shaders.
+# Build rules generated to stringify these into headers
+# which are needed by some of the sources
+SET(BulletSoftBodyOpenCLSolvers_Shaders
+#	OutputToVertexArray
+	UpdateNormals
+	Integrate
+	UpdatePositions
+	UpdateNodes
+	SolvePositions
+	UpdatePositionsFromVelocities
+	ApplyForces
+	PrepareLinks
+	VSolveLinks
+)
+
+foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${f}.cl")
+endforeach(f) 
+
+
+
+ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_Apple
+	${BulletSoftBodyOpenCLSolvers_SRCS} 
+	${BulletSoftBodyOpenCLSolvers_HDRS} 
+	${BulletSoftBodyOpenCLSolvers_OpenCLC}
+)
+
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES SOVERSION ${BULLET_VERSION})
+IF (BUILD_SHARED_LIBS)
+	TARGET_LINK_LIBRARIES(BulletSoftBody BulletDynamics)
+ENDIF (BUILD_SHARED_LIBS)
+
+
+IF (INSTALL_LIBS)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Apple  DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Apple  DESTINATION lib${LIB_SUFFIX})
+				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/CMakeLists.txt
@@ -1,71 +1,16 @@

-INCLUDE_DIRECTORIES(
-${BULLET_PHYSICS_SOURCE_DIR}/src
-)
+IF(BUILD_MINICL_OPENCL_DEMOS)
+	SUBDIRS( MiniCL  )
+ENDIF()

+IF(BUILD_AMD_OPENCL_DEMOS)
+	SUBDIRS(AMD)
+ENDIF()

-SET(OPENCL_DIR $ENV{ATISTREAMSDKROOT})
-SET(OPENCL_INCLUDE_PATH "${ATISTREAMSDKROOT}/include" CACHE DOCSTRING "OpenCL SDK include path")
+IF(BUILD_NVIDIA_OPENCL_DEMOS)
+	SUBDIRS(NVidia)
+ENDIF()

-INCLUDE_DIRECTORIES(${OPENCL_INCLUDE_PATH} "../cpu/")
-
-SET(BulletSoftBodyOpenCLSolvers_SRCS
-	btSoftBodySolver_OpenCL.cpp
-)
-
-SET(BulletSoftBodyOpenCLSolvers_HDRS
-	btSoftBodySolver_OpenCL.h
-	../cpu/btSoftBodySolverData.h
-	btSoftBodySolverVertexData_OpenCL.h
-	btSoftBodySolverTriangleData_OpenCL.h
-	btSoftBodySolverLinkData_OpenCL.h
-	btSoftBodySolverBuffer_OpenCL.h
-)
-
-# OpenCL and HLSL Shaders.
-# Build rules generated to stringify these into headers
-# which are needed by some of the sources
-SET(BulletSoftBodyOpenCLSolvers_Shaders
-#	OutputToVertexArray
-	UpdateNormals
-	Integrate
-	UpdatePositions
-	UpdateNodes
-	SolvePositions
-	UpdatePositionsFromVelocities
-	ApplyForces
-	PrepareLinks
-	VSolveLinks
-)
-
-foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
-    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "OpenCLC/${f}.cl")
-endforeach(f) 
-
-
-
-ADD_LIBRARY(BulletSoftBodySolvers_OpenCL  ${BulletSoftBodyOpenCLSolvers_SRCS} ${BulletSoftBodyOpenCLSolvers_HDRS} ${BulletSoftBodyOpenCLSolvers_OpenCLC})
-SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL PROPERTIES VERSION ${BULLET_VERSION})
-SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL PROPERTIES SOVERSION ${BULLET_VERSION})
-IF (BUILD_SHARED_LIBS)
-	TARGET_LINK_LIBRARIES(BulletSoftBody BulletDynamics)
-ENDIF (BUILD_SHARED_LIBS)
-
-
-IF (INSTALL_LIBS)
-	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
-		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
-			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
-				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL DESTINATION .)
-			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
-			INSTALL(TARGETS BulletSoftBodySolvers_OpenCL DESTINATION lib${LIB_SUFFIX})
-				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
-			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
-		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
-
-		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
-			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL PROPERTIES FRAMEWORK true)
-			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
-		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
-	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
-ENDIF (INSTALL_LIBS)
+IF(APPLE)
+	SUBDIRS(Apple)
+ENDIF()
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/CMakeLists.txt
@@ -0,0 +1,75 @@
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+ADD_DEFINITIONS(-DUSE_MINICL)
+
+
+
+
+SET(BulletSoftBodyOpenCLSolvers_SRCS
+	../btSoftBodySolver_OpenCL.cpp
+)
+
+SET(BulletSoftBodyOpenCLSolvers_HDRS
+	../btSoftBodySolver_OpenCL.h
+	../../CPU/btSoftBodySolverData.h
+	../btSoftBodySolverVertexData_OpenCL.h
+	../btSoftBodySolverTriangleData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverBuffer_OpenCL.h
+)
+
+# OpenCL and HLSL Shaders.
+# Build rules generated to stringify these into headers
+# which are needed by some of the sources
+SET(BulletSoftBodyOpenCLSolvers_Shaders
+#	OutputToVertexArray
+	UpdateNormals
+	Integrate
+	UpdatePositions
+	UpdateNodes
+	SolvePositions
+	UpdatePositionsFromVelocities
+	ApplyForces
+	PrepareLinks
+	VSolveLinks
+)
+
+foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${f}.cl")
+endforeach(f) 
+
+
+
+ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_Mini
+	${BulletSoftBodyOpenCLSolvers_SRCS} 
+	${BulletSoftBodyOpenCLSolvers_HDRS} 
+	${BulletSoftBodyOpenCLSolvers_OpenCLC}
+)
+
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES SOVERSION ${BULLET_VERSION})
+IF (BUILD_SHARED_LIBS)
+	TARGET_LINK_LIBRARIES(BulletSoftBody BulletDynamics)
+ENDIF (BUILD_SHARED_LIBS)
+
+
+IF (INSTALL_LIBS)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Mini DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Mini DESTINATION lib${LIB_SUFFIX})
+				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
@@ -0,0 +1,40 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <MiniCL/cl_MiniCL_Defs.h>
+
+#define MSTRINGIFY(A) A
+#include "../OpenCLC10/ApplyForces.cl"
+#include "../OpenCLC10/Integrate.cl"
+#include "../OpenCLC10/PrepareLinks.cl"
+#include "../OpenCLC10/SolvePositions.cl"
+#include "../OpenCLC10/UpdateNodes.cl"
+#include "../OpenCLC10/UpdateNormals.cl"
+#include "../OpenCLC10/UpdatePositions.cl"
+#include "../OpenCLC10/UpdatePositionsFromVelocities.cl"
+//#include "../OpenCLC10/VSolveLinks.cl"
+
+MINICL_REGISTER(PrepareLinksKernel)
+MINICL_REGISTER(UpdatePositionsFromVelocitiesKernel)
+MINICL_REGISTER(SolvePositionsFromLinksKernel)
+MINICL_REGISTER(updateVelocitiesFromPositionsWithVelocitiesKernel)
+MINICL_REGISTER(updateVelocitiesFromPositionsWithoutVelocitiesKernel)
+MINICL_REGISTER(IntegrateKernel)
+MINICL_REGISTER(ApplyForcesKernel)
+MINICL_REGISTER(ResetNormalsAndAreasKernel)
+MINICL_REGISTER(NormalizeNormalsAndAreasKernel)
+MINICL_REGISTER(UpdateSoftBodiesKernel)
+
+
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
@@ -0,0 +1,79 @@
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+
+
+IF(INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+	INCLUDE_DIRECTORIES( $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/inc	)
+ELSE()
+	INCLUDE_DIRECTORIES( $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/inc	)
+ENDIF()
+
+
+SET(BulletSoftBodyOpenCLSolvers_SRCS
+	../btSoftBodySolver_OpenCL.cpp
+)
+
+SET(BulletSoftBodyOpenCLSolvers_HDRS
+	../btSoftBodySolver_OpenCL.h
+	../../CPU/btSoftBodySolverData.h
+	../btSoftBodySolverVertexData_OpenCL.h
+	../btSoftBodySolverTriangleData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverBuffer_OpenCL.h
+)
+
+# OpenCL and HLSL Shaders.
+# Build rules generated to stringify these into headers
+# which are needed by some of the sources
+SET(BulletSoftBodyOpenCLSolvers_Shaders
+#	OutputToVertexArray
+	UpdateNormals
+	Integrate
+	UpdatePositions
+	UpdateNodes
+	SolvePositions
+	UpdatePositionsFromVelocities
+	ApplyForces
+	PrepareLinks
+	VSolveLinks
+)
+
+foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC/${f}.cl")
+endforeach(f) 
+
+
+
+ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_NVidia
+	${BulletSoftBodyOpenCLSolvers_SRCS} 
+	${BulletSoftBodyOpenCLSolvers_HDRS} 
+	${BulletSoftBodyOpenCLSolvers_OpenCLC}
+)
+
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES SOVERSION ${BULLET_VERSION})
+IF (BUILD_SHARED_LIBS)
+	TARGET_LINK_LIBRARIES(BulletSoftBody BulletDynamics)
+ENDIF (BUILD_SHARED_LIBS)
+
+
+IF (INSTALL_LIBS)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_NVidia DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_NVidia DESTINATION lib${LIB_SUFFIX})
+				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ApplyForces.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ApplyForces.cl
@@ -0,0 +1,91 @@
+MSTRINGIFY(
+
+
+float adot3(float4 a, float4 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+float4 projectOnAxis( float4 v, float4 a )
+{
+	return (a*adot3(v, a));
+}
+
+__kernel void 
+ApplyForcesKernel(
+	const uint numNodes,
+	const float solverdt,
+	const float epsilon,
+	__global int * g_vertexClothIdentifier,
+	__global float4 * g_vertexNormal,
+	__global float * g_vertexArea,
+	__global float * g_vertexInverseMass,
+	__global float * g_clothLiftFactor,
+	__global float * g_clothDragFactor,
+	__global float4 * g_clothWindVelocity,
+	__global float4 * g_clothAcceleration,
+	__global float * g_clothMediumDensity,
+	__global float4 * g_vertexForceAccumulator,
+	__global float4 * g_vertexVelocity GUID_ARG)
+{
+	unsigned int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{		
+		int clothId  = g_vertexClothIdentifier[nodeID];
+		float nodeIM = g_vertexInverseMass[nodeID];
+		
+		if( nodeIM > 0.0f )
+		{
+			float4 nodeV  = g_vertexVelocity[nodeID];
+			float4 normal = g_vertexNormal[nodeID];
+			float area    = g_vertexArea[nodeID];
+			float4 nodeF  = g_vertexForceAccumulator[nodeID];
+			
+			// Read per-cloth values
+			float4 clothAcceleration = g_clothAcceleration[clothId];
+			float4 clothWindVelocity = g_clothWindVelocity[clothId];
+			float liftFactor = g_clothLiftFactor[clothId];
+			float dragFactor = g_clothDragFactor[clothId];
+			float mediumDensity = g_clothMediumDensity[clothId];
+		
+			// Apply the acceleration to the cloth rather than do this via a force
+			nodeV += (clothAcceleration*solverdt);
+
+			g_vertexVelocity[nodeID] = nodeV;
+
+			float4 relativeWindVelocity = nodeV - clothWindVelocity;
+			float relativeSpeedSquared = dot(relativeWindVelocity, relativeWindVelocity);
+			
+			if( relativeSpeedSquared > epsilon )
+			{
+				// Correct direction of normal relative to wind direction and get dot product
+				normal = normal * (dot(normal, relativeWindVelocity) < 0 ? -1.f : 1.f);
+				float dvNormal = dot(normal, relativeWindVelocity);
+				if( dvNormal > 0 )
+				{
+					float4 force = (float4)(0.f, 0.f, 0.f, 0.f);
+					float c0 = area * dvNormal * relativeSpeedSquared / 2.f;
+					float c1 = c0 * mediumDensity;
+					force += normal * (-c1 * liftFactor);
+					force += normalize(relativeWindVelocity)*(-c1 * dragFactor);
+					
+					float dtim = solverdt * nodeIM;
+					float4 forceDTIM = force * dtim;
+					
+					float4 nodeFPlusForce = nodeF + force;
+					
+					// m_nodesf[i] -= ProjectOnAxis(m_nodesv[i], force.normalized())/dtim;	
+					float4 nodeFMinus = nodeF - (projectOnAxis(nodeV, normalize(force))/dtim);
+					
+					nodeF = nodeFPlusForce;
+					if( dot(forceDTIM, forceDTIM) > dot(nodeV, nodeV) )
+						nodeF = nodeFMinus;
+									
+					g_vertexForceAccumulator[nodeID] = nodeF;	
+				}
+			}
+		}
+	}
+}
+
+);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/Integrate.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/Integrate.cl
@@ -0,0 +1,35 @@
+MSTRINGIFY(
+
+// Node indices for each link
+
+
+
+__kernel void
+IntegrateKernel( 
+	const int numNodes,
+	const float solverdt,
+	__global float * g_vertexInverseMasses,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_vertexVelocity,
+	__global float4 * g_vertexPreviousPositions,
+	__global float4 * g_vertexForceAccumulator GUID_ARG)
+{
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{	
+		float4 position   = g_vertexPositions[nodeID];
+		float4 velocity   = g_vertexVelocity[nodeID];
+		float4 force      = g_vertexForceAccumulator[nodeID];
+		float inverseMass = g_vertexInverseMasses[nodeID];
+		
+		g_vertexPreviousPositions[nodeID] = position;
+		velocity += force * inverseMass * solverdt;
+		position += velocity * solverdt;
+		
+		g_vertexForceAccumulator[nodeID] = (float4)(0.f, 0.f, 0.f, 0.0f);
+		g_vertexPositions[nodeID]        = position;
+		g_vertexVelocity[nodeID]         = velocity;	
+	}
+}
+
+);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/PrepareLinks.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/PrepareLinks.cl
@@ -0,0 +1,41 @@
+MSTRINGIFY(
+
+float dot3(float4 a, float4 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+
+__kernel void 
+PrepareLinksKernel( 
+	const int numLinks,
+	__global int2 * g_linksVertexIndices,
+	__global float * g_linksMassLSC,
+	__global float4 * g_nodesPreviousPosition,
+	__global float * g_linksLengthRatio,
+	__global float4 * g_linksCurrentLength GUID_ARG)
+{
+	int linkID = get_global_id(0);
+	if( linkID < numLinks )
+	{	
+		
+		int2 nodeIndices = g_linksVertexIndices[linkID];
+		int node0 = nodeIndices.x;
+		int node1 = nodeIndices.y;
+		
+		float4 nodePreviousPosition0 = g_nodesPreviousPosition[node0];
+		float4 nodePreviousPosition1 = g_nodesPreviousPosition[node1];
+
+		float massLSC = g_linksMassLSC[linkID];
+		
+		float4 linkCurrentLength = nodePreviousPosition1 - nodePreviousPosition0;
+		
+		float linkLengthRatio = dot3(linkCurrentLength, linkCurrentLength)*massLSC;
+		linkLengthRatio = 1.0f/linkLengthRatio;
+		
+		g_linksCurrentLength[linkID] = linkCurrentLength;
+		g_linksLengthRatio[linkID]   = linkLengthRatio;		
+	}
+}
+
+);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositions.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositions.cl
@@ -0,0 +1,57 @@
+
+
+
+MSTRINGIFY(
+
+
+float mydot3(float4 a, float4 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+
+__kernel void 
+SolvePositionsFromLinksKernel( 
+	const int startLink,
+	const int numLinks,
+	const float kst,
+	const float ti,
+	__global int2 * g_linksVertexIndices,
+	__global float * g_linksMassLSC,
+	__global float * g_linksRestLengthSquared,
+	__global float * g_verticesInverseMass,
+	__global float4 * g_vertexPositions GUID_ARG)
+	
+{
+	int linkID = get_global_id(0) + startLink;
+	if( get_global_id(0) < numLinks )
+	{	
+		float massLSC = g_linksMassLSC[linkID];
+		float restLengthSquared = g_linksRestLengthSquared[linkID];
+		
+		if( massLSC > 0.0f )
+		{		
+			int2 nodeIndices = g_linksVertexIndices[linkID];
+			int node0 = nodeIndices.x;
+			int node1 = nodeIndices.y;
+			
+			float4 position0 = g_vertexPositions[node0];
+			float4 position1 = g_vertexPositions[node1];
+
+			float inverseMass0 = g_verticesInverseMass[node0];
+			float inverseMass1 = g_verticesInverseMass[node1]; 
+
+			float4 del = position1 - position0;
+			float len  = mydot3(del, del);
+			float k    = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
+			position0 = position0 - del*(k*inverseMass0);
+			position1 = position1 + del*(k*inverseMass1);
+
+			g_vertexPositions[node0] = position0;
+			g_vertexPositions[node1] = position1;
+
+		}
+	}
+}
+
+);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateConstants.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateConstants.cl
@@ -0,0 +1,44 @@
+MSTRINGIFY(
+
+/*#define float3 float4
+
+float dot3(float3 a, float3 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}*/
+
+__kernel void 
+UpdateConstantsKernel( 
+	const int numLinks,
+	__global int2 * g_linksVertexIndices,
+	__global float4 * g_vertexPositions,
+	__global float * g_vertexInverseMasses,
+	__global float * g_linksMaterialLSC,
+	__global float * g_linksMassLSC,
+	__global float * g_linksRestLengthSquared,
+	__global float * g_linksRestLengths)
+{
+	int linkID = get_global_id(0);
+	if( linkID < numLinks )
+	{	
+		int2 nodeIndices = g_linksVertexIndices[linkID];
+		int node0 = nodeIndices.x;
+		int node1 = nodeIndices.y;
+		float linearStiffnessCoefficient = g_linksMaterialLSC[ linkID ];
+		
+		float3 position0   = g_vertexPositions[node0].xyz;
+		float3 position1   = g_vertexPositions[node1].xyz;
+		float inverseMass0 = g_vertexInverseMasses[node0];
+		float inverseMass1 = g_vertexInverseMasses[node1];
+
+		float3 difference = position0 - position1;
+		float length2 = dot(difference, difference);
+		float length = sqrt(length2);
+	
+		g_linksRestLengths[linkID] = length;
+		g_linksMassLSC[linkID] = (inverseMass0 + inverseMass1)/linearStiffnessCoefficient;
+		g_linksRestLengthSquared[linkID] = length*length;		
+	}
+}
+
+);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNodes.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNodes.cl
@@ -0,0 +1,39 @@
+MSTRINGIFY(
+
+
+__kernel void 
+updateVelocitiesFromPositionsWithVelocitiesKernel( 
+	int numNodes,
+	float isolverdt,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_vertexPreviousPositions,
+	__global int * g_vertexClothIndices,
+	__global float *g_clothVelocityCorrectionCoefficients,
+	__global float * g_clothDampingFactor,
+	__global float4 * g_vertexVelocities,
+	__global float4 * g_vertexForces GUID_ARG)
+{
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{	
+		float4 position = g_vertexPositions[nodeID];
+		float4 previousPosition = g_vertexPreviousPositions[nodeID];
+		float4 velocity = g_vertexVelocities[nodeID];
+		int clothIndex = g_vertexClothIndices[nodeID];
+		float velocityCorrectionCoefficient = g_clothVelocityCorrectionCoefficients[clothIndex];
+		float dampingFactor = g_clothDampingFactor[clothIndex];
+		float velocityCoefficient = (1.f - dampingFactor);
+		
+		float4 difference = position - previousPosition;
+				
+		velocity += difference*velocityCorrectionCoefficient*isolverdt;
+		
+		// Damp the velocity
+		velocity *= velocityCoefficient;
+		
+		g_vertexVelocities[nodeID] = velocity;
+		g_vertexForces[nodeID] = (float4)(0.f, 0.f, 0.f, 0.f);								
+	}
+}
+
+);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNormals.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNormals.cl
@@ -0,0 +1,102 @@
+MSTRINGIFY(
+
+float length3(float4 a)
+{
+	a.w = 0;
+	return length(a);
+}
+
+float4 normalize3(float4 a)
+{
+	a.w = 0;
+	return normalize(a);
+}
+
+__kernel void 
+ResetNormalsAndAreasKernel(
+	const unsigned int numNodes,
+	__global float4 * g_vertexNormals,
+	__global float * g_vertexArea GUID_ARG)
+{
+	if( get_global_id(0) < numNodes )
+	{
+		g_vertexNormals[get_global_id(0)] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
+		g_vertexArea[get_global_id(0)]    = 0.0f;
+	}
+}
+
+
+__kernel void 
+UpdateSoftBodiesKernel(
+	const unsigned int startFace,
+	const unsigned int numFaces,
+	__global int4 * g_triangleVertexIndexSet,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_vertexNormals,
+	__global float * g_vertexArea,
+	__global float4 * g_triangleNormals,
+	__global float * g_triangleArea GUID_ARG)
+{
+	int faceID = get_global_id(0) + startFace;
+	if( get_global_id(0) < numFaces )
+	{		
+		int4 triangleIndexSet = g_triangleVertexIndexSet[ faceID ];
+		int nodeIndex0 = triangleIndexSet.x;
+		int nodeIndex1 = triangleIndexSet.y;
+		int nodeIndex2 = triangleIndexSet.z;
+
+		float4 node0 = g_vertexPositions[nodeIndex0];
+		float4 node1 = g_vertexPositions[nodeIndex1];
+		float4 node2 = g_vertexPositions[nodeIndex2];
+		float4 nodeNormal0 = g_vertexNormals[nodeIndex0];
+		float4 nodeNormal1 = g_vertexNormals[nodeIndex1];
+		float4 nodeNormal2 = g_vertexNormals[nodeIndex2];
+		float vertexArea0 = g_vertexArea[nodeIndex0];
+		float vertexArea1 = g_vertexArea[nodeIndex1];
+		float vertexArea2 = g_vertexArea[nodeIndex2];
+		
+		float4 vector0 = node1 - node0;
+		float4 vector1 = node2 - node0;
+		
+		float4 faceNormal = cross(vector0, vector1);
+		float triangleArea = length(faceNormal);
+
+		nodeNormal0 = nodeNormal0 + faceNormal;
+		nodeNormal1 = nodeNormal1 + faceNormal;
+		nodeNormal2 = nodeNormal2 + faceNormal;
+		vertexArea0 = vertexArea0 + triangleArea;
+		vertexArea1 = vertexArea1 + triangleArea;
+		vertexArea2 = vertexArea2 + triangleArea;
+		
+		g_triangleNormals[faceID] = normalize3(faceNormal);
+		g_vertexNormals[nodeIndex0] = nodeNormal0;
+		g_vertexNormals[nodeIndex1] = nodeNormal1;
+		g_vertexNormals[nodeIndex2] = nodeNormal2;
+		g_triangleArea[faceID] = triangleArea;
+		g_vertexArea[nodeIndex0] = vertexArea0;
+		g_vertexArea[nodeIndex1] = vertexArea1;
+		g_vertexArea[nodeIndex2] = vertexArea2;
+	}
+}
+
+__kernel void 
+NormalizeNormalsAndAreasKernel( 
+	const unsigned int numNodes,
+	__global int * g_vertexTriangleCount,
+	__global float4 * g_vertexNormals,
+	__global float * g_vertexArea GUID_ARG)
+{
+	if( get_global_id(0) < numNodes )
+	{
+		float4 normal = g_vertexNormals[get_global_id(0)];
+		float area = g_vertexArea[get_global_id(0)];
+		int numTriangles = g_vertexTriangleCount[get_global_id(0)];
+		
+		float vectorLength = length3(normal);
+		
+		g_vertexNormals[get_global_id(0)] = normalize3(normal);
+		g_vertexArea[get_global_id(0)] = area/(float)(numTriangles);
+	}
+}
+
+);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositions.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositions.cl
@@ -0,0 +1,34 @@
+MSTRINGIFY(
+
+__kernel void 
+updateVelocitiesFromPositionsWithoutVelocitiesKernel( 
+	const int numNodes,
+	const float isolverdt,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_vertexPreviousPositions,
+	__global int * g_vertexClothIndices,
+	__global float * g_clothDampingFactor,
+	__global float4 * g_vertexVelocities,
+	__global float4 * g_vertexForces GUID_ARG)
+
+{
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{	
+		float4 position = g_vertexPositions[nodeID];
+		float4 previousPosition = g_vertexPreviousPositions[nodeID];
+		float4 velocity = g_vertexVelocities[nodeID];
+		int clothIndex = g_vertexClothIndices[nodeID];
+		float dampingFactor = g_clothDampingFactor[clothIndex];
+		float velocityCoefficient = (1.f - dampingFactor);
+		
+		float4 difference = position - previousPosition;
+				
+		velocity = difference*velocityCoefficient*isolverdt;		
+		
+		g_vertexVelocities[nodeID] = velocity;
+		g_vertexForces[nodeID] = (float4)(0.f, 0.f, 0.f, 0.f);								
+	}
+}
+
+);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositionsFromVelocities.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositionsFromVelocities.cl
@@ -0,0 +1,28 @@
+
+MSTRINGIFY(
+
+
+
+
+__kernel void 
+UpdatePositionsFromVelocitiesKernel( 
+	const int numNodes,
+	const float solverSDT,
+	__global float4 * g_vertexVelocities,
+	__global float4 * g_vertexPreviousPositions,
+	__global float4 * g_vertexCurrentPosition GUID_ARG)
+{
+	int vertexID = get_global_id(0);
+	if( vertexID < numNodes )
+	{	
+		float4 previousPosition = g_vertexPreviousPositions[vertexID];
+		float4 velocity         = g_vertexVelocities[vertexID];
+		
+		float4 newPosition      = previousPosition + velocity*solverSDT;
+		
+		g_vertexCurrentPosition[vertexID]   = newPosition;
+		g_vertexPreviousPositions[vertexID] = newPosition;
+	}
+}
+
+);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/VSolveLinks.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/VSolveLinks.cl
@@ -0,0 +1,45 @@
+MSTRINGIFY(
+
+__kernel void 
+VSolveLinksKernel( 
+	int startLink,
+	int numLinks,
+	float kst,
+	__global int2 * g_linksVertexIndices,
+	__global float * g_linksLengthRatio,
+	__global float4 * g_linksCurrentLength,
+	__global float * g_vertexInverseMass,
+	__global float4 * g_vertexVelocity GUID_ARG)
+{
+	int linkID = get_global_id(0) + startLink;
+	if( get_global_id(0) < numLinks )
+	{		
+		int2 nodeIndices = g_linksVertexIndices[linkID];
+		int node0 = nodeIndices.x;
+		int node1 = nodeIndices.y;
+		
+		float linkLengthRatio = g_linksLengthRatio[linkID];
+		float3 linkCurrentLength = g_linksCurrentLength[linkID].xyz;
+		
+		float3 vertexVelocity0 = g_vertexVelocity[node0].xyz;
+		float3 vertexVelocity1 = g_vertexVelocity[node1].xyz;
+
+		float vertexInverseMass0 = g_vertexInverseMass[node0];
+		float vertexInverseMass1 = g_vertexInverseMass[node1]; 
+
+		float3 nodeDifference = vertexVelocity0 - vertexVelocity1;
+		float dotResult = dot(linkCurrentLength, nodeDifference);
+		float j = -dotResult*linkLengthRatio*kst;
+		
+		float3 velocityChange0 = linkCurrentLength*(j*vertexInverseMass0);
+		float3 velocityChange1 = linkCurrentLength*(j*vertexInverseMass1);
+		
+		vertexVelocity0 += velocityChange0;
+		vertexVelocity1 -= velocityChange1;
+
+		g_vertexVelocity[node0] = (float4)(vertexVelocity0, 0.f);
+		g_vertexVelocity[node1] = (float4)(vertexVelocity1, 0.f);
+	}
+}
+
+);
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h
@@ -17,7 +17,16 @@ subject to the following restrictions:
 #define BT_SOFT_BODY_SOLVER_BUFFER_OPENCL_H

 // OpenCL support
-#include <CL/cl.hpp>
+
+#ifdef USE_MINICL
+	#include "MiniCL/cl.h"
+#else //USE_MINICL
+	#ifdef __APPLE__
+		#include <OpenCL/OpenCL.h>
+	#else
+		#include <CL/cl.h>
+	#endif //__APPLE__
+#endif//USE_MINICL

 #ifndef SAFE_RELEASE
 #define SAFE_RELEASE(p)      { if(p) { (p)->Release(); (p)=NULL; } }
@@ -25,22 +34,25 @@ subject to the following restrictions:

 template <typename ElementType> class btOpenCLBuffer
 {
-protected:
-	cl::CommandQueue m_queue;
-	btAlignedObjectArray< ElementType > * m_CPUBuffer;
-	cl::Buffer m_buffer;
+public:

+	cl_command_queue	m_cqCommandQue;
+	cl_context			m_clContext;
+	cl_mem				m_buffer;
+
+
+
+	btAlignedObjectArray< ElementType > * m_CPUBuffer;
+	
 	int  m_gpuSize;
 	bool m_onGPU;
-
 	bool m_readOnlyOnGPU;
-
 	bool m_allocated;
-	// TODO: Remove this once C++ bindings are fixed
-	cl::Context context;

-	bool createBuffer( cl::Buffer *preexistingBuffer = 0)
+
+	bool createBuffer( cl_mem* preexistingBuffer = 0)
 	{
+
 		cl_int err;
 		 

@@ -49,12 +61,11 @@ protected:
 			m_buffer = *preexistingBuffer;
 		} 
 		else {
-			m_buffer = cl::Buffer(
-					context, 
-					m_readOnlyOnGPU ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE, 
-					m_CPUBuffer->size() * sizeof(ElementType), 
-					0, 
-					&err);
+
+			cl_mem_flags flags= m_readOnlyOnGPU ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE;
+
+			size_t size = m_CPUBuffer->size() * sizeof(ElementType);
+			m_buffer = clCreateBuffer(m_clContext, flags, size, 0, &err);
 			if( err != CL_SUCCESS )
 			{
 				btAssert( "Buffer::Buffer(m_buffer)");
@@ -62,35 +73,31 @@ protected:
 		}

 		m_gpuSize = m_CPUBuffer->size();
+
 		return true;
 	}

 public:
-	btOpenCLBuffer( 
-		cl::CommandQueue queue,
-		btAlignedObjectArray< ElementType > *CPUBuffer, 
-		bool readOnly) :
-		m_queue(queue),
+	btOpenCLBuffer( cl_command_queue	commandQue,cl_context ctx, btAlignedObjectArray< ElementType >* CPUBuffer, bool readOnly)
+		:m_cqCommandQue(commandQue),
+		m_clContext(ctx),
 		m_CPUBuffer(CPUBuffer),
 		m_gpuSize(0),
 		m_onGPU(false),
 		m_readOnlyOnGPU(readOnly),
 		m_allocated(false)
 	{
-		context = m_queue.getInfo<CL_QUEUE_CONTEXT>();
 	}

 	~btOpenCLBuffer()
 	{
 	}

-	cl::Buffer getBuffer()
-	{
-		return m_buffer;
-	}

 	bool moveToGPU()
 	{
+
+
 		cl_int err;

 		if( (m_CPUBuffer->size() != m_gpuSize) )
@@ -107,12 +114,12 @@ public:
 				m_allocated = true;
 			}
 			
-			err = m_queue.enqueueWriteBuffer(
-				m_buffer,
+			size_t size = m_CPUBuffer->size() * sizeof(ElementType);
+			err = clEnqueueWriteBuffer(m_cqCommandQue,m_buffer,
 				CL_FALSE,
 				0,
-				m_CPUBuffer->size() * sizeof(ElementType), 
-				&((*m_CPUBuffer)[0]));
+				size, 
+				&((*m_CPUBuffer)[0]),0,0,0);
 			if( err != CL_SUCCESS )
 			{
 				btAssert( "CommandQueue::enqueueWriteBuffer(m_buffer)" );
@@ -122,20 +129,23 @@ public:
 		}

 		return true;
+
 	}

 	bool moveFromGPU()
 	{
+
 		cl_int err;

 		if (m_CPUBuffer->size() > 0) {
 			if (m_onGPU && !m_readOnlyOnGPU) {
-				err = m_queue.enqueueReadBuffer(
+				size_t size = m_CPUBuffer->size() * sizeof(ElementType);
+				err = clEnqueueReadBuffer(m_cqCommandQue,
 					m_buffer,
 					CL_TRUE,
 					0,
-					m_CPUBuffer->size() * sizeof(ElementType), 
-					&((*m_CPUBuffer)[0]));
+					size,
+					&((*m_CPUBuffer)[0]),0,0,0);

 				if( err != CL_SUCCESS )
 				{
@@ -151,16 +161,17 @@ public:

 	bool copyFromGPU()
 	{
+
 		cl_int err;
+		size_t size = m_CPUBuffer->size() * sizeof(ElementType);

 		if (m_CPUBuffer->size() > 0) {
 			if (m_onGPU && !m_readOnlyOnGPU) {
-				err = m_queue.enqueueReadBuffer(
+				err = clEnqueueReadBuffer(m_cqCommandQue,
 					m_buffer,
 					CL_TRUE,
-					0,
-					m_CPUBuffer->size() * sizeof(ElementType), 
-					&((*m_CPUBuffer)[0]));
+					0,size, 
+					&((*m_CPUBuffer)[0]),0,0,0);

 				if( err != CL_SUCCESS )
 				{
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h
@@ -13,8 +13,8 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */

-#include "BulletSoftBody/Solvers/CPU/btSoftBodySolverData.h"
-#include "BulletSoftBody/Solvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h"
+#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"


 #ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_H
@@ -25,7 +25,9 @@ class btSoftBodyLinkDataOpenCL : public btSoftBodyLinkData
 {
 public:
 	bool				m_onGPU;
-	cl::CommandQueue	m_queue;
+
+	cl_command_queue	m_cqCommandQue;
+

 	btOpenCLBuffer<LinkNodePair> m_clLinks;
 	btOpenCLBuffer<float>							      m_clLinkStrength;
@@ -36,6 +38,24 @@ public:
 	btOpenCLBuffer<float>								  m_clLinksRestLength;
 	btOpenCLBuffer<float>								  m_clLinksMaterialLinearStiffnessCoefficient;

+	struct BatchPair
+	{
+		int start;
+		int length;
+
+		BatchPair() :
+			start(0),
+			length(0)
+		{
+		}
+
+		BatchPair( int s, int l ) : 
+			start( s ),
+			length( l )
+		{
+		}
+	};
+
 	/**
 	 * Link addressing information for each cloth.
 	 * Allows link locations to be computed independently of data batching.
@@ -45,9 +65,9 @@ public:
 	/**
 	 * Start and length values for computation batches over link data.
 	 */
-	btAlignedObjectArray< std::pair< int, int > >		m_batchStartLengths;
+	btAlignedObjectArray< BatchPair >		m_batchStartLengths;

-	btSoftBodyLinkDataOpenCL(cl::CommandQueue queue);
+	btSoftBodyLinkDataOpenCL(cl_command_queue queue, cl_context ctx);

 	virtual ~btSoftBodyLinkDataOpenCL();

--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h
@@ -14,8 +14,8 @@ subject to the following restrictions:
 */


-#include "BulletSoftBody/Solvers/CPU/btSoftBodySolverData.h"
-#include "BulletSoftBody/Solvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h"
+#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"


 #ifndef BT_SOFT_BODY_SOLVER_TRIANGLE_DATA_OPENCL_H
@@ -26,7 +26,7 @@ class btSoftBodyTriangleDataOpenCL : public btSoftBodyTriangleData
 {
 public:
 	bool				m_onGPU;
-	cl::CommandQueue    m_queue;
+	cl_command_queue    m_queue;

 	btOpenCLBuffer<btSoftBodyTriangleData::TriangleNodeSet>					m_clVertexIndices;
 	btOpenCLBuffer<float>								m_clArea;
@@ -41,10 +41,20 @@ public:
 	/**
 	 * Start and length values for computation batches over link data.
 	 */
-	btAlignedObjectArray< std::pair< int, int > >		m_batchStartLengths;
+	struct btSomePair
+	{
+		btSomePair() {}
+		btSomePair(int f,int s)
+			:first(f),second(s)
+		{
+		}
+		int first;
+		int second;
+	};
+	btAlignedObjectArray< btSomePair >		m_batchStartLengths;

 public:
-	btSoftBodyTriangleDataOpenCL( cl::CommandQueue queue );
+	btSoftBodyTriangleDataOpenCL( cl_command_queue queue, cl_context ctx );

 	virtual ~btSoftBodyTriangleDataOpenCL();

--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
@@ -13,8 +13,8 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */

-#include "BulletSoftBody/Solvers/CPU/btSoftBodySolverData.h"
-#include "BulletSoftBody/Solvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h"
+#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"

 #ifndef BT_SOFT_BODY_SOLVER_VERTEX_DATA_OPENCL_H
 #define BT_SOFT_BODY_SOLVER_VERTEX_DATA_OPENCL_H
@@ -24,7 +24,7 @@ class btSoftBodyVertexDataOpenCL : public btSoftBodyVertexData
 {
 protected:
 	bool		m_onGPU;
-	cl::CommandQueue m_queue;
+	cl_command_queue	m_queue;

 public:
 	btOpenCLBuffer<int>									m_clClothIdentifier;
@@ -37,7 +37,7 @@ public:
 	btOpenCLBuffer<float>									m_clVertexArea;
 	btOpenCLBuffer<int>									m_clVertexTriangleCount;
 public:
-	btSoftBodyVertexDataOpenCL( cl::CommandQueue queue);
+	btSoftBodyVertexDataOpenCL( cl_command_queue queue,  cl_context ctx);

 	virtual ~btSoftBodyVertexDataOpenCL();

--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
@@ -16,10 +16,18 @@ subject to the following restrictions:

 #include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
 #include "vectormath/vmInclude.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolver_OpenCL.h"
-#include "BulletSoftBody/VertexBuffers/btSoftBodySolverVertexBuffer.h"
+#include <stdio.h> //@todo: remove the debugging printf at some stage
+#include "btSoftBodySolver_OpenCL.h"
+#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
 #include "BulletSoftBody/btSoftBody.h"

+     static const size_t workGroupSize = 128;
+
+
+//CL_VERSION_1_1 seems broken on NVidia SDK so just disable it
+
+#if (0)//CL_VERSION_1_1 == 1)
+ //OpenCL 1.1 kernels use float3
 #define MSTRINGIFY(A) #A
 static char* PrepareLinksCLString = 
 #include "OpenCLC/PrepareLinks.cl"
@@ -41,19 +49,43 @@ static char* UpdateNormalsCLString =
 #include "OpenCLC/UpdateNormals.cl"
 static char* VSolveLinksCLString = 
 #include "OpenCLC/VSolveLinks.cl"
+#else
+////OpenCL 1.0 kernels don't use float3
+#define MSTRINGIFY(A) #A
+static char* PrepareLinksCLString = 
+#include "OpenCLC10/PrepareLinks.cl"
+static char* UpdatePositionsFromVelocitiesCLString = 
+#include "OpenCLC10/UpdatePositionsFromVelocities.cl"
+static char* SolvePositionsCLString = 
+#include "OpenCLC10/SolvePositions.cl"
+static char* UpdateNodesCLString = 
+#include "OpenCLC10/UpdateNodes.cl"
+static char* UpdatePositionsCLString = 
+#include "OpenCLC10/UpdatePositions.cl"
+static char* UpdateConstantsCLString = 
+#include "OpenCLC10/UpdateConstants.cl"
+static char* IntegrateCLString = 
+#include "OpenCLC10/Integrate.cl"
+static char* ApplyForcesCLString = 
+#include "OpenCLC10/ApplyForces.cl"
+static char* UpdateNormalsCLString = 
+#include "OpenCLC10/UpdateNormals.cl"
+static char* VSolveLinksCLString = 
+#include "OpenCLC10/VSolveLinks.cl"
+#endif //CL_VERSION_1_1


-btSoftBodyVertexDataOpenCL::btSoftBodyVertexDataOpenCL( cl::CommandQueue queue) :
+btSoftBodyVertexDataOpenCL::btSoftBodyVertexDataOpenCL( cl_command_queue queue, cl_context ctx) :
    m_queue(queue),
-	m_clClothIdentifier( queue, &m_clothIdentifier, false ),
-	m_clVertexPosition( queue, &m_vertexPosition, false ),
-	m_clVertexPreviousPosition( queue, &m_vertexPreviousPosition, false ),
-	m_clVertexVelocity( queue, &m_vertexVelocity, false ),
-	m_clVertexForceAccumulator( queue, &m_vertexForceAccumulator, false ),
-	m_clVertexNormal( queue, &m_vertexNormal, false ),
-	m_clVertexInverseMass( queue, &m_vertexInverseMass, false ),
-	m_clVertexArea( queue, &m_vertexArea, false ),
-	m_clVertexTriangleCount( queue, &m_vertexTriangleCount, false )
+	m_clClothIdentifier( queue, ctx, &m_clothIdentifier, false ),
+	m_clVertexPosition( queue, ctx, &m_vertexPosition, false ),
+	m_clVertexPreviousPosition( queue, ctx, &m_vertexPreviousPosition, false ),
+	m_clVertexVelocity( queue, ctx, &m_vertexVelocity, false ),
+	m_clVertexForceAccumulator( queue, ctx, &m_vertexForceAccumulator, false ),
+	m_clVertexNormal( queue, ctx, &m_vertexNormal, false ),
+	m_clVertexInverseMass( queue, ctx, &m_vertexInverseMass, false ),
+	m_clVertexArea( queue, ctx, &m_vertexArea, false ),
+	m_clVertexTriangleCount( queue, ctx, &m_vertexTriangleCount, false )
 {
 }

@@ -108,16 +140,16 @@ bool btSoftBodyVertexDataOpenCL::moveFromAccelerator()



-btSoftBodyLinkDataOpenCL::btSoftBodyLinkDataOpenCL(cl::CommandQueue queue) :
-    m_queue(queue),
-	m_clLinks( queue, &m_links, false ),
-	m_clLinkStrength( queue, &m_linkStrength, false ),
-	m_clLinksMassLSC( queue, &m_linksMassLSC, false ),
-	m_clLinksRestLengthSquared( queue, &m_linksRestLengthSquared, false ),
-	m_clLinksCLength( queue, &m_linksCLength, false ),
-	m_clLinksLengthRatio( queue, &m_linksLengthRatio, false ),
-	m_clLinksRestLength( queue, &m_linksRestLength, false ),
-	m_clLinksMaterialLinearStiffnessCoefficient( queue, &m_linksMaterialLinearStiffnessCoefficient, false )
+btSoftBodyLinkDataOpenCL::btSoftBodyLinkDataOpenCL(cl_command_queue queue,  cl_context ctx) 
+:m_cqCommandQue(queue),
+	m_clLinks( queue, ctx, &m_links, false ),
+	m_clLinkStrength( queue, ctx, &m_linkStrength, false ),
+	m_clLinksMassLSC( queue, ctx, &m_linksMassLSC, false ),
+	m_clLinksRestLengthSquared( queue, ctx, &m_linksRestLengthSquared, false ),
+	m_clLinksCLength( queue, ctx, &m_linksCLength, false ),
+	m_clLinksLengthRatio( queue, ctx, &m_linksLengthRatio, false ),
+	m_clLinksRestLength( queue, ctx, &m_linksRestLength, false ),
+	m_clLinksMaterialLinearStiffnessCoefficient( queue, ctx, &m_linksMaterialLinearStiffnessCoefficient, false )
 {
 }

@@ -272,13 +304,13 @@ void btSoftBodyLinkDataOpenCL::generateBatches()
 	if( m_batchStartLengths.size() > 0 )
 	{
 		m_batchStartLengths.resize(batchCounts.size());
-		m_batchStartLengths[0] = std::pair< int, int >( 0, 0 );
+		m_batchStartLengths[0] = BatchPair(0, 0);

 		int sum = 0;
 		for( int batchIndex = 0; batchIndex < batchCounts.size(); ++batchIndex )
 		{
-			m_batchStartLengths[batchIndex].first = sum;
-			m_batchStartLengths[batchIndex].second = batchCounts[batchIndex];
+			m_batchStartLengths[batchIndex].start = sum;
+			m_batchStartLengths[batchIndex].length = batchCounts[batchIndex];
 			sum += batchCounts[batchIndex];
 		}
 	}
@@ -313,7 +345,7 @@ void btSoftBodyLinkDataOpenCL::generateBatches()
 		// next element in that batch, incrementing the batch counter
 		// afterwards
 		int batch = batchValues[linkIndex];
-		int newLocation = m_batchStartLengths[batch].first + batchCounts[batch];
+		int newLocation = m_batchStartLengths[batch].start + batchCounts[batch];

 		batchCounts[batch] = batchCounts[batch] + 1;
 		m_links[newLocation] = m_links_Backup[linkLocation];
@@ -336,11 +368,11 @@ void btSoftBodyLinkDataOpenCL::generateBatches()



-btSoftBodyTriangleDataOpenCL::btSoftBodyTriangleDataOpenCL( cl::CommandQueue queue ) : 
+btSoftBodyTriangleDataOpenCL::btSoftBodyTriangleDataOpenCL( cl_command_queue queue , cl_context ctx) : 
    m_queue( queue ),
-	m_clVertexIndices( queue, &m_vertexIndices, false ),
-	m_clArea( queue, &m_area, false ),
-	m_clNormal( queue, &m_normal, false )
+	m_clVertexIndices( queue, ctx, &m_vertexIndices, false ),
+	m_clArea( queue, ctx, &m_area, false ),
+	m_clNormal( queue, ctx, &m_normal, false )
 {
 }

@@ -493,7 +525,7 @@ void btSoftBodyTriangleDataOpenCL::generateBatches()


 	m_batchStartLengths.resize(batchCounts.size());
-	m_batchStartLengths[0] = std::pair< int, int >( 0, 0 );
+	m_batchStartLengths[0] = btSomePair(0,0);


 	int sum = 0;
@@ -547,18 +579,19 @@ void btSoftBodyTriangleDataOpenCL::generateBatches()



-btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(const cl::CommandQueue &queue) :
-	m_linkData(queue),
-	m_vertexData(queue),
-	m_triangleData(queue),
-	m_clPerClothAcceleration(queue, &m_perClothAcceleration, true ),
-	m_clPerClothWindVelocity(queue, &m_perClothWindVelocity, true ),
-	m_clPerClothDampingFactor(queue, &m_perClothDampingFactor, true ),
-	m_clPerClothVelocityCorrectionCoefficient(queue, &m_perClothVelocityCorrectionCoefficient, true ),
-	m_clPerClothLiftFactor(queue, &m_perClothLiftFactor, true ),
-	m_clPerClothDragFactor(queue, &m_perClothDragFactor, true ),
-	m_clPerClothMediumDensity(queue, &m_perClothMediumDensity, true ),
-	m_queue( queue )
+btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_context ctx) :
+	m_linkData(queue, ctx),
+	m_vertexData(queue, ctx),
+	m_triangleData(queue, ctx),
+	m_clPerClothAcceleration(queue, ctx, &m_perClothAcceleration, true ),
+	m_clPerClothWindVelocity(queue, ctx, &m_perClothWindVelocity, true ),
+	m_clPerClothDampingFactor(queue,ctx, &m_perClothDampingFactor, true ),
+	m_clPerClothVelocityCorrectionCoefficient(queue, ctx,&m_perClothVelocityCorrectionCoefficient, true ),
+	m_clPerClothLiftFactor(queue, ctx,&m_perClothLiftFactor, true ),
+	m_clPerClothDragFactor(queue, ctx,&m_perClothDragFactor, true ),
+	m_clPerClothMediumDensity(queue, ctx,&m_perClothMediumDensity, true ),
+	m_cqCommandQue( queue ),
+	m_cxMainContext(ctx)
 {
 	// Initial we will clearly need to update solver constants
 	// For now this is global for the cloths linked with this solver - we should probably make this body specific 
@@ -590,7 +623,7 @@ void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &sof
 			using Vectormath::Aos::Point3;

 			// Create SoftBody that will store the information within the solver
-			btAcceleratedSoftBodyInterface *newSoftBody = new btAcceleratedSoftBodyInterface( softBody );
+			btOpenCLAcceleratedSoftBodyInterface *newSoftBody = new btOpenCLAcceleratedSoftBodyInterface( softBody );
 			m_softBodySet.push_back( newSoftBody );

 			m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );
@@ -712,51 +745,58 @@ bool btOpenCLSoftBodySolver::checkInitialized()

 void btOpenCLSoftBodySolver::resetNormalsAndAreas( int numVertices )
 {
-	resetNormalsAndAreasKernel.kernel.setArg(0, numVertices);
-	resetNormalsAndAreasKernel.kernel.setArg(1, m_vertexData.m_clVertexNormal.getBuffer());
-	resetNormalsAndAreasKernel.kernel.setArg(2, m_vertexData.m_clVertexArea.getBuffer());
+	cl_int ciErrNum;
+	ciErrNum = clSetKernelArg(resetNormalsAndAreasKernel, 0, sizeof(numVertices), (void*)&numVertices); //oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	ciErrNum = clSetKernelArg(resetNormalsAndAreasKernel, 1, sizeof(cl_mem), (void*)&m_vertexData.m_clVertexNormal.m_buffer);//oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	ciErrNum = clSetKernelArg(resetNormalsAndAreasKernel,  2, sizeof(cl_mem), (void*)&m_vertexData.m_clVertexArea.m_buffer); //oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	size_t numWorkItems = workGroupSize*((numVertices + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, resetNormalsAndAreasKernel, 1, NULL, &numWorkItems, &workGroupSize, 0,0,0 );

-	int	numWorkItems = workGroupSize*((numVertices + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(resetNormalsAndAreasKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS )
+	if( ciErrNum != CL_SUCCESS )
 	{
-		btAssert( "enqueueNDRangeKernel(resetNormalsAndAreasKernel)" );
+		btAssert( 0 && "enqueueNDRangeKernel(resetNormalsAndAreasKernel)" );
 	}
+
 }

 void btOpenCLSoftBodySolver::normalizeNormalsAndAreas( int numVertices )
 {
-	normalizeNormalsAndAreasKernel.kernel.setArg(0, numVertices);
-	normalizeNormalsAndAreasKernel.kernel.setArg(1, m_vertexData.m_clVertexTriangleCount.getBuffer());
-	normalizeNormalsAndAreasKernel.kernel.setArg(2, m_vertexData.m_clVertexNormal.getBuffer());
-	normalizeNormalsAndAreasKernel.kernel.setArg(3, m_vertexData.m_clVertexArea.getBuffer());

-	int	numWorkItems = workGroupSize*((numVertices + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(normalizeNormalsAndAreasKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum;
+
+	ciErrNum = clSetKernelArg(normalizeNormalsAndAreasKernel, 0, sizeof(int),(void*) &numVertices);
+	ciErrNum = clSetKernelArg(normalizeNormalsAndAreasKernel, 1, sizeof(cl_mem), &m_vertexData.m_clVertexTriangleCount.m_buffer);
+	ciErrNum = clSetKernelArg(normalizeNormalsAndAreasKernel, 2, sizeof(cl_mem), &m_vertexData.m_clVertexNormal.m_buffer);
+	ciErrNum = clSetKernelArg(normalizeNormalsAndAreasKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexArea.m_buffer);
+	size_t	numWorkItems = workGroupSize*((numVertices + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, normalizeNormalsAndAreasKernel, 1, NULL, &numWorkItems, &workGroupSize, 0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert( "enqueueNDRangeKernel(normalizeNormalsAndAreasKernel)");
+		btAssert( 0 && "enqueueNDRangeKernel(normalizeNormalsAndAreasKernel)");
 	}
+
 }

 void btOpenCLSoftBodySolver::executeUpdateSoftBodies( int firstTriangle, int numTriangles )
 {
-	updateSoftBodiesKernel.kernel.setArg(0, firstTriangle);
-	updateSoftBodiesKernel.kernel.setArg(1, numTriangles);
-	updateSoftBodiesKernel.kernel.setArg(2, m_triangleData.m_clVertexIndices.getBuffer());
-	updateSoftBodiesKernel.kernel.setArg(3, m_vertexData.m_clVertexPosition.getBuffer());
-	updateSoftBodiesKernel.kernel.setArg(4, m_vertexData.m_clVertexNormal.getBuffer());
-	updateSoftBodiesKernel.kernel.setArg(5, m_vertexData.m_clVertexArea.getBuffer());
-	updateSoftBodiesKernel.kernel.setArg(6, m_triangleData.m_clNormal.getBuffer());
-	updateSoftBodiesKernel.kernel.setArg(7, m_triangleData.m_clArea.getBuffer());

+	cl_int ciErrNum;
+	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 0, sizeof(int), (void*) &firstTriangle);
+	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 1, sizeof(int), &numTriangles);
+	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 2, sizeof(cl_mem), &m_triangleData.m_clVertexIndices.m_buffer);
+	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
+	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 4, sizeof(cl_mem), &m_vertexData.m_clVertexNormal.m_buffer);
+	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexArea.m_buffer);
+	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 6, sizeof(cl_mem), &m_triangleData.m_clNormal.m_buffer);
+	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 7, sizeof(cl_mem), &m_triangleData.m_clArea.m_buffer);

-	int	numWorkItems = workGroupSize*((numTriangles + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(updateSoftBodiesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	size_t numWorkItems = workGroupSize*((numTriangles + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, updateSoftBodiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(normalizeNormalsAndAreasKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(normalizeNormalsAndAreasKernel)");
 	}
+
 }

 void btOpenCLSoftBodySolver::updateSoftBodies()
@@ -807,6 +847,7 @@ void btOpenCLSoftBodySolver::ApplyClampedForce( float solverdt, const Vectormath

 void btOpenCLSoftBodySolver::applyForces( float solverdt )
 {	
+
 	// Ensure data is on accelerator
 	m_vertexData.moveToAccelerator();
 	m_clPerClothAcceleration.moveToGPU();
@@ -815,85 +856,30 @@ void btOpenCLSoftBodySolver::applyForces( float solverdt )
 	m_clPerClothMediumDensity.moveToGPU();
 	m_clPerClothWindVelocity.moveToGPU();			

-	cl_int err;
-	err = applyForcesKernel.kernel.setArg(0, m_vertexData.getNumVertices());
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum ;
+	int numVerts = m_vertexData.getNumVertices();
+	ciErrNum = clSetKernelArg(applyForcesKernel, 0, sizeof(int), &numVerts);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 1, sizeof(float), &solverdt);
+	float fl = FLT_EPSILON;
+	ciErrNum = clSetKernelArg(applyForcesKernel, 2, sizeof(float), &fl);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 3, sizeof(cl_mem), &m_vertexData.m_clClothIdentifier.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 4, sizeof(cl_mem), &m_vertexData.m_clVertexNormal.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexArea.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 6, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 7, sizeof(cl_mem), &m_clPerClothLiftFactor.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 8 ,sizeof(cl_mem), &m_clPerClothDragFactor.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 9, sizeof(cl_mem), &m_clPerClothWindVelocity.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel,10, sizeof(cl_mem), &m_clPerClothAcceleration.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel,11, sizeof(cl_mem), &m_clPerClothMediumDensity.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel,12, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel,13, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
+	size_t numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,applyForcesKernel, 1, NULL, &numWorkItems, &workGroupSize, 0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(1, solverdt);
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(2, FLT_EPSILON);
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(3, m_vertexData.m_clClothIdentifier.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(4, m_vertexData.m_clVertexNormal.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(5, m_vertexData.m_clVertexArea.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(6, m_vertexData.m_clVertexInverseMass.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(7, m_clPerClothLiftFactor.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(8, m_clPerClothDragFactor.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(9, m_clPerClothWindVelocity.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(10, m_clPerClothAcceleration.getBuffer());
-	if( err != CL_SUCCESS )
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(11, m_clPerClothMediumDensity.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(12, m_vertexData.m_clVertexForceAccumulator.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(13, m_vertexData.m_clVertexVelocity.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(applyForcesKernel)");
 	}

-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
-
-	err = m_queue.enqueueNDRangeKernel(applyForcesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
 }

 /**
@@ -901,22 +887,26 @@ void btOpenCLSoftBodySolver::applyForces( float solverdt )
 */
 void btOpenCLSoftBodySolver::integrate( float solverdt )
 {
+	
+
 	// Ensure data is on accelerator
 	m_vertexData.moveToAccelerator();

-	integrateKernel.kernel.setArg(0, m_vertexData.getNumVertices());
-	integrateKernel.kernel.setArg(1, solverdt);
-	integrateKernel.kernel.setArg(2, m_vertexData.m_clVertexInverseMass.getBuffer());
-	integrateKernel.kernel.setArg(3, m_vertexData.m_clVertexPosition.getBuffer());
-	integrateKernel.kernel.setArg(4, m_vertexData.m_clVertexVelocity.getBuffer());
-	integrateKernel.kernel.setArg(5, m_vertexData.m_clVertexPreviousPosition.getBuffer());
-	integrateKernel.kernel.setArg(6, m_vertexData.m_clVertexForceAccumulator.getBuffer());
+	cl_int ciErrNum;
+	int numVerts = m_vertexData.getNumVertices();
+	ciErrNum = clSetKernelArg(integrateKernel, 0, sizeof(int), &numVerts);
+	ciErrNum = clSetKernelArg(integrateKernel, 1, sizeof(float), &solverdt);
+	ciErrNum = clSetKernelArg(integrateKernel, 2, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
+	ciErrNum = clSetKernelArg(integrateKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
+	ciErrNum = clSetKernelArg(integrateKernel, 4, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
+	ciErrNum = clSetKernelArg(integrateKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
+	ciErrNum = clSetKernelArg(integrateKernel, 6, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);

-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(integrateKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS )
+	size_t numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,integrateKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS )
 	{
-		btAssert(  "enqueueNDRangeKernel(integrateKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(integrateKernel)");
 	}

 }
@@ -935,6 +925,7 @@ float btOpenCLSoftBodySolver::computeTriangleArea(

 void btOpenCLSoftBodySolver::updateConstants( float timeStep )
 {			
+
 	using namespace Vectormath::Aos;

 	if( m_updateSolverConstants )
@@ -959,10 +950,12 @@ void btOpenCLSoftBodySolver::updateConstants( float timeStep )
 			m_linkData.getRestLengthSquared(linkIndex) = restLengthSquared;
 		}
 	}
+
 }

 void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 {
+
 	using Vectormath::Aos::Vector3;
 	using Vectormath::Aos::Point3;
 	using Vectormath::Aos::lengthSqr;
@@ -988,33 +981,34 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )



-	// Prepare anchors
-	/*for(i=0,ni=m_anchors.size();i<ni;++i)
+	for( int iteration = 0; iteration < m_numberOfVelocityIterations ; ++iteration )
 	{
-		Anchor&			a=m_anchors[i];
-		const btVector3	ra=a.m_body->getWorldTransform().getBasis()*a.m_local;
-		a.m_c0	=	ImpulseMatrix(	m_sst.sdt,
-			a.m_node->m_im,
-			a.m_body->getInvMass(),
-			a.m_body->getInvInertiaTensorWorld(),
-			ra);
-		a.m_c1	=	ra;
-		a.m_c2	=	m_sst.sdt*a.m_node->m_im;
-		a.m_body->activate();
-	}*/
+		for( int i = 0; i < m_linkData.m_batchStartLengths.size(); ++i )
+		{
+			int startLink = m_linkData.m_batchStartLengths[i].start;
+			int numLinks = m_linkData.m_batchStartLengths[i].length;

-	// Really want to combine these into a single loop, don't we? No update in the middle?
-
-	// TODO: Double check what kst is meant to mean - passed in as 1 in the bullet code
+			solveLinksForVelocity( startLink, numLinks, kst );
+		}
+	}

+	// Compute new positions from velocity
+	// Also update the previous position so that our position computation is now based on the new position from the velocity solution
+	// rather than based directly on the original positions
+	if( m_numberOfVelocityIterations > 0 )
+	{
+		updateVelocitiesFromPositionsWithVelocities( 1.f/solverdt );
+	} else {
+		updateVelocitiesFromPositionsWithoutVelocities( 1.f/solverdt );
+	}

 	// Solve drift
 	for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
 	{
 		for( int i = 0; i < m_linkData.m_batchStartLengths.size(); ++i )
 		{
-			int startLink = m_linkData.m_batchStartLengths[i].first;
-			int numLinks = m_linkData.m_batchStartLengths[i].second;
+			int startLink = m_linkData.m_batchStartLengths[i].start;
+			int numLinks = m_linkData.m_batchStartLengths[i].length;

 			solveLinksForPosition( startLink, numLinks, kst, ti );
 		}
@@ -1023,6 +1017,7 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )


 	updateVelocitiesFromPositionsWithoutVelocities( 1.f/solverdt );
+
 }


@@ -1030,96 +1025,136 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 // Kernel dispatches
 void btOpenCLSoftBodySolver::prepareLinks()
 {
-	prepareLinksKernel.kernel.setArg(0, m_linkData.getNumLinks());
-	prepareLinksKernel.kernel.setArg(1, m_linkData.m_clLinks.getBuffer());
-	prepareLinksKernel.kernel.setArg(2, m_linkData.m_clLinksMassLSC.getBuffer());
-	prepareLinksKernel.kernel.setArg(3, m_vertexData.m_clVertexPreviousPosition.getBuffer());
-	prepareLinksKernel.kernel.setArg(4, m_linkData.m_clLinksLengthRatio.getBuffer());
-	prepareLinksKernel.kernel.setArg(5, m_linkData.m_clLinksCLength.getBuffer());

-	int	numWorkItems = workGroupSize*((m_linkData.getNumLinks() + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(prepareLinksKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum;
+	int numLinks = m_linkData.getNumLinks();
+	ciErrNum = clSetKernelArg(prepareLinksKernel,0, sizeof(int), &numLinks);
+	ciErrNum = clSetKernelArg(prepareLinksKernel,1, sizeof(cl_mem), &m_linkData.m_clLinks.m_buffer);
+	ciErrNum = clSetKernelArg(prepareLinksKernel,2, sizeof(cl_mem), &m_linkData.m_clLinksMassLSC.m_buffer);
+	ciErrNum = clSetKernelArg(prepareLinksKernel,3, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
+	ciErrNum = clSetKernelArg(prepareLinksKernel,4, sizeof(cl_mem), &m_linkData.m_clLinksLengthRatio.m_buffer);
+	ciErrNum = clSetKernelArg(prepareLinksKernel,5, sizeof(cl_mem), &m_linkData.m_clLinksCLength.m_buffer);
+
+	size_t	numWorkItems = workGroupSize*((m_linkData.getNumLinks() + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,prepareLinksKernel, 1 , NULL, &numWorkItems, &workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(prepareLinksKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(prepareLinksKernel)");
 	}
+
 }

 void btOpenCLSoftBodySolver::updatePositionsFromVelocities( float solverdt )
 {
-	updatePositionsFromVelocitiesKernel.kernel.setArg(0, m_vertexData.getNumVertices());
-	updatePositionsFromVelocitiesKernel.kernel.setArg(1, solverdt);
-	updatePositionsFromVelocitiesKernel.kernel.setArg(2, m_vertexData.m_clVertexVelocity.getBuffer());
-	updatePositionsFromVelocitiesKernel.kernel.setArg(3, m_vertexData.m_clVertexPreviousPosition.getBuffer());
-	updatePositionsFromVelocitiesKernel.kernel.setArg(4, m_vertexData.m_clVertexPosition.getBuffer());

-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(updatePositionsFromVelocitiesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum;
+	int numVerts = m_vertexData.getNumVertices();
+	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,0, sizeof(int), &numVerts);
+	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,1, sizeof(float), &solverdt);
+	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,2, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
+	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,3, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
+	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,4, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
+
+	size_t	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,updatePositionsFromVelocitiesKernel, 1, NULL, &numWorkItems,&workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(updatePositionsFromVelocitiesKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(updatePositionsFromVelocitiesKernel)");
 	}
+
 }

 void btOpenCLSoftBodySolver::solveLinksForPosition( int startLink, int numLinks, float kst, float ti )
 {
-	solvePositionsFromLinksKernel.kernel.setArg(0, startLink);
-	solvePositionsFromLinksKernel.kernel.setArg(1, numLinks);
-	solvePositionsFromLinksKernel.kernel.setArg(2, kst);
-	solvePositionsFromLinksKernel.kernel.setArg(3, ti);
-	solvePositionsFromLinksKernel.kernel.setArg(4, m_linkData.m_clLinks.getBuffer());
-	solvePositionsFromLinksKernel.kernel.setArg(5, m_linkData.m_clLinksMassLSC.getBuffer());
-	solvePositionsFromLinksKernel.kernel.setArg(6, m_linkData.m_clLinksRestLengthSquared.getBuffer());
-	solvePositionsFromLinksKernel.kernel.setArg(7, m_vertexData.m_clVertexInverseMass.getBuffer());
-	solvePositionsFromLinksKernel.kernel.setArg(8, m_vertexData.m_clVertexPosition.getBuffer());

-	int	numWorkItems = workGroupSize*((numLinks + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(solvePositionsFromLinksKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum;
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,0, sizeof(int), &startLink);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,1, sizeof(int), &numLinks);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,2, sizeof(float), &kst);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,3, sizeof(float), &ti);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,4, sizeof(cl_mem), &m_linkData.m_clLinks.m_buffer);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,5, sizeof(cl_mem), &m_linkData.m_clLinksMassLSC.m_buffer);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,6, sizeof(cl_mem), &m_linkData.m_clLinksRestLengthSquared.m_buffer);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,7, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,8, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
+
+	size_t	numWorkItems = workGroupSize*((numLinks + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,solvePositionsFromLinksKernel,1,NULL,&numWorkItems,&workGroupSize,0,0,0);
+	if( ciErrNum!= CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(solvePositionsFromLinksKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(solvePositionsFromLinksKernel)");
 	}
+
 } // solveLinksForPosition


+void btOpenCLSoftBodySolver::solveLinksForVelocity( int startLink, int numLinks, float kst )
+{
+
+	cl_int ciErrNum;
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 0, sizeof(int), &startLink);
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 1, sizeof(int), &numLinks);
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 2, sizeof(cl_mem), &m_linkData.m_clLinks.m_buffer);
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 3, sizeof(cl_mem), &m_linkData.m_clLinksLengthRatio.m_buffer);
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 4, sizeof(cl_mem), &m_linkData.m_clLinksCLength.m_buffer);
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 6, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
+
+	size_t	numWorkItems = workGroupSize*((numLinks + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,vSolveLinksKernel,1,NULL,&numWorkItems, &workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
+	{
+		btAssert( 0 &&  "enqueueNDRangeKernel(vSolveLinksKernel)");
+	}
+
+}
+
 void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithVelocities( float isolverdt )
 {
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(0, m_vertexData.getNumVertices());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(1, isolverdt);
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(2, m_vertexData.m_clVertexPosition.getBuffer());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(3, m_vertexData.m_clVertexPreviousPosition.getBuffer());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(4, m_vertexData.m_clClothIdentifier.getBuffer());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(5, m_clPerClothVelocityCorrectionCoefficient.getBuffer());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(6, m_clPerClothDampingFactor.getBuffer());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(7, m_vertexData.m_clVertexVelocity.getBuffer());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(8, m_vertexData.m_clVertexForceAccumulator.getBuffer());

-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(updateVelocitiesFromPositionsWithVelocitiesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum;
+	int numVerts = m_vertexData.getNumVertices();
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel,0, sizeof(int), &numVerts);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 1, sizeof(float), &isolverdt);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 2, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 4, sizeof(cl_mem), &m_vertexData.m_clClothIdentifier.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 5, sizeof(cl_mem), &m_clPerClothVelocityCorrectionCoefficient.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 6, sizeof(cl_mem), &m_clPerClothDampingFactor.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 7, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 8, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
+
+	size_t	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,updateVelocitiesFromPositionsWithVelocitiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(updateVelocitiesFromPositionsWithVelocitiesKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(updateVelocitiesFromPositionsWithVelocitiesKernel)");
 	}

+
 } // updateVelocitiesFromPositionsWithVelocities

 void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( float isolverdt )
 {
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(0, m_vertexData.getNumVertices());
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(1, isolverdt);
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(2, m_vertexData.m_clVertexPosition.getBuffer());
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(3, m_vertexData.m_clVertexPreviousPosition.getBuffer());
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(4, m_vertexData.m_clClothIdentifier.getBuffer());
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(5, m_clPerClothDampingFactor.getBuffer());
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(6, m_vertexData.m_clVertexVelocity.getBuffer());
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(7, m_vertexData.m_clVertexForceAccumulator.getBuffer());

-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum;
+	int numVerts = m_vertexData.getNumVertices();
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 0, sizeof(int), &numVerts);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 1, sizeof(float), &isolverdt);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 2, sizeof(cl_mem),&m_vertexData.m_clVertexPosition.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 3, sizeof(cl_mem),&m_vertexData.m_clVertexPreviousPosition.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 4, sizeof(cl_mem),&m_vertexData.m_clClothIdentifier.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 5, sizeof(cl_mem),&m_clPerClothDampingFactor.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 6, sizeof(cl_mem),&m_vertexData.m_clVertexVelocity.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 7, sizeof(cl_mem),&m_vertexData.m_clVertexForceAccumulator.m_buffer);
+
+	size_t	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,updateVelocitiesFromPositionsWithoutVelocitiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(updateVelocitiesFromPositionsWithoutVelocitiesKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(updateVelocitiesFromPositionsWithoutVelocitiesKernel)");
 	}
+
 } // updateVelocitiesFromPositionsWithoutVelocities

 // End kernel dispatches
@@ -1133,15 +1168,20 @@ void btOpenCLSoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * cons
 	// and use them together on a single kernel call if possible by setting up a
 	// per-cloth target buffer array for the copy kernel.

-	btAcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );
+
+	btOpenCLAcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );
+
+	const int firstVertex = currentCloth->getFirstVertex();
+	const int lastVertex = firstVertex + currentCloth->getNumVertices();

 	if( vertexBuffer->getBufferType() == btVertexBufferDescriptor::CPU_BUFFER )
 	{		
-		const int firstVertex = currentCloth->getFirstVertex();
-		const int lastVertex = firstVertex + currentCloth->getNumVertices();
 		const btCPUVertexBufferDescriptor *cpuVertexBuffer = static_cast< btCPUVertexBufferDescriptor* >(vertexBuffer);						
 		float *basePointer = cpuVertexBuffer->getBasePointer();						

+		m_vertexData.m_clVertexPosition.copyFromGPU();
+		m_vertexData.m_clVertexNormal.copyFromGPU();
+
 		if( vertexBuffer->hasVertexPositions() )
 		{
 			const int vertexOffset = cpuVertexBuffer->getVertexOffset();
@@ -1173,43 +1213,46 @@ void btOpenCLSoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * cons
 			}
 		}
 	}
+
 } // btCPUSoftBodySolver::outputToVertexBuffers


-btOpenCLSoftBodySolver::KernelDesc btOpenCLSoftBodySolver::compileCLKernelFromString( const char *shaderString, const char *shaderName )
+cl_kernel btOpenCLSoftBodySolver::compileCLKernelFromString( const char* kernelSource, const char* kernelName )
 {
-	cl_int err;
+	printf("compiling kernalName: %s ",kernelName);
+	cl_kernel kernel;
+	cl_int ciErrNum;
+	size_t program_length = strlen(kernelSource);

-	context = m_queue.getInfo<CL_QUEUE_CONTEXT>();
-	device = m_queue.getInfo<CL_QUEUE_DEVICE>();
-	std::vector< cl::Device > devices;
-	devices.push_back( device );
+	cl_program m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&kernelSource, &program_length, &ciErrNum);
+//	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		
+    // Build the program with 'mad' Optimization option
+#ifdef MAC
+	char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
+#else
+	const char* flags = "-DGUID_ARG=";
+#endif
+    ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, flags, NULL, NULL);
+    if (ciErrNum != CL_SUCCESS)
+    {
+        printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
+		btAssert(0);
+        exit(0);
+    }
+	
+    // Create the kernel
+    kernel = clCreateKernel(m_cpProgram, kernelName, &ciErrNum);
+    if (ciErrNum != CL_SUCCESS)
+    {
+        printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
+		btAssert(0);
+		exit(0);
+    }

-	cl::Program::Sources source(1, std::make_pair(shaderString, strlen(shaderString) + 1));
-	cl::Program program(context, source, &err);
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert( "program" );
-	}
-	err = program.build(devices);
-	if (err != CL_SUCCESS) {
-		//std::string str;
-		//str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
-		//std::cout << "Program Info: " << str;
-		if( err != CL_SUCCESS ) 
-		{
-			btAssert( "Program::build()" );
-		}
-	}
-	cl::Kernel kernel(program, shaderName, &err);
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert( "kernel" );
-	}
+	printf("ready. \n");
+	return kernel;

-	KernelDesc descriptor;
-	descriptor.kernel = kernel;
-	return descriptor;
 }

 void btOpenCLSoftBodySolver::predictMotion( float timeStep )
@@ -1234,11 +1277,11 @@ void btOpenCLSoftBodySolver::predictMotion( float timeStep )



-btOpenCLSoftBodySolver::btAcceleratedSoftBodyInterface *btOpenCLSoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
+btOpenCLAcceleratedSoftBodyInterface *btOpenCLSoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
 {
 	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
 	{
-		btAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
+		btOpenCLAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
 		if( softBodyInterface->getSoftBody() == softBody )
 			return softBodyInterface;
 	}
@@ -1273,4 +1316,4 @@ bool btOpenCLSoftBodySolver::buildShaders()
 		m_shadersInitialized = true;

 	return returnVal;
-}
+}
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
@@ -16,204 +16,165 @@ subject to the following restrictions:
 #ifndef BT_SOFT_BODY_SOLVER_OPENCL_H
 #define BT_SOFT_BODY_SOLVER_OPENCL_H

+#include "stddef.h" //for size_t
 #include "vectormath/vmInclude.h"
+
 #include "BulletSoftBody/btSoftBodySolvers.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
+#include "btSoftBodySolverLinkData_OpenCL.h"
+#include "btSoftBodySolverVertexData_OpenCL.h"
+#include "btSoftBodySolverTriangleData_OpenCL.h"
+
+
+/**
+ * SoftBody class to maintain information about a soft body instance
+ * within a solver.
+ * This data addresses the main solver arrays.
+ */
+class btOpenCLAcceleratedSoftBodyInterface
+{
+protected:
+	/** Current number of vertices that are part of this cloth */
+	int m_numVertices;
+	/** Maximum number of vertices allocated to be part of this cloth */
+	int m_maxVertices;
+	/** Current number of triangles that are part of this cloth */
+	int m_numTriangles;
+	/** Maximum number of triangles allocated to be part of this cloth */
+	int m_maxTriangles;
+	/** Index of first vertex in the world allocated to this cloth */
+	int m_firstVertex;
+	/** Index of first triangle in the world allocated to this cloth */
+	int m_firstTriangle;
+	/** Index of first link in the world allocated to this cloth */
+	int m_firstLink;
+	/** Maximum number of links allocated to this cloth */
+	int m_maxLinks;
+	/** Current number of links allocated to this cloth */
+	int m_numLinks;
+
+	/** The actual soft body this data represents */
+	btSoftBody *m_softBody;
+
+
+public:
+	btOpenCLAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
+	  m_softBody( softBody )
+	{
+		m_numVertices = 0;
+		m_maxVertices = 0;
+		m_numTriangles = 0;
+		m_maxTriangles = 0;
+		m_firstVertex = 0;
+		m_firstTriangle = 0;
+		m_firstLink = 0;
+		m_maxLinks = 0;
+		m_numLinks = 0;
+	}
+	int getNumVertices()
+	{
+		return m_numVertices;
+	}
+
+	int getNumTriangles()
+	{
+		return m_numTriangles;
+	}
+
+	int getMaxVertices()
+	{
+		return m_maxVertices;
+	}
+
+	int getMaxTriangles()
+	{
+		return m_maxTriangles;
+	}
+
+	int getFirstVertex()
+	{
+		return m_firstVertex;
+	}
+
+	int getFirstTriangle()
+	{
+		return m_firstTriangle;
+	}
+
+	// TODO: All of these set functions will have to do checks and
+	// update the world because restructuring of the arrays will be necessary
+	// Reasonable use of "friend"?
+	void setNumVertices( int numVertices )
+	{
+		m_numVertices = numVertices;
+	}	
+	
+	void setNumTriangles( int numTriangles )
+	{
+		m_numTriangles = numTriangles;
+	}
+
+	void setMaxVertices( int maxVertices )
+	{
+		m_maxVertices = maxVertices;
+	}
+
+	void setMaxTriangles( int maxTriangles )
+	{
+		m_maxTriangles = maxTriangles;
+	}
+
+	void setFirstVertex( int firstVertex )
+	{
+		m_firstVertex = firstVertex;
+	}
+
+	void setFirstTriangle( int firstTriangle )
+	{
+		m_firstTriangle = firstTriangle;
+	}
+
+	void setMaxLinks( int maxLinks )
+	{
+		m_maxLinks = maxLinks;
+	}
+
+	void setNumLinks( int numLinks )
+	{
+		m_numLinks = numLinks;
+	}
+
+	void setFirstLink( int firstLink )
+	{
+		m_firstLink = firstLink;
+	}
+
+	int getMaxLinks()
+	{
+		return m_maxLinks;
+	}
+
+	int getNumLinks()
+	{
+		return m_numLinks;
+	}
+
+	int getFirstLink()
+	{
+		return m_firstLink;
+	}
+
+	btSoftBody* getSoftBody()
+	{
+		return m_softBody;
+	}
+
+};


 class btOpenCLSoftBodySolver : public btSoftBodySolver
 {
 private:
-	/**
-	 * SoftBody class to maintain information about a soft body instance
-	 * within a solver.
-	 * This data addresses the main solver arrays.
-	 */
-	class btAcceleratedSoftBodyInterface
-	{
-	protected:
-		/** Current number of vertices that are part of this cloth */
-		int m_numVertices;
-		/** Maximum number of vertices allocated to be part of this cloth */
-		int m_maxVertices;
-		/** Current number of triangles that are part of this cloth */
-		int m_numTriangles;
-		/** Maximum number of triangles allocated to be part of this cloth */
-		int m_maxTriangles;
-		/** Index of first vertex in the world allocated to this cloth */
-		int m_firstVertex;
-		/** Index of first triangle in the world allocated to this cloth */
-		int m_firstTriangle;
-		/** Index of first link in the world allocated to this cloth */
-		int m_firstLink;
-		/** Maximum number of links allocated to this cloth */
-		int m_maxLinks;
-		/** Current number of links allocated to this cloth */
-		int m_numLinks;
-
-		/** The actual soft body this data represents */
-		btSoftBody *m_softBody;
-
-
-	public:
-		btAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
-		  m_softBody( softBody )
-		{
-			m_numVertices = 0;
-			m_maxVertices = 0;
-			m_numTriangles = 0;
-			m_maxTriangles = 0;
-			m_firstVertex = 0;
-			m_firstTriangle = 0;
-			m_firstLink = 0;
-			m_maxLinks = 0;
-			m_numLinks = 0;
-		}
-		int getNumVertices()
-		{
-			return m_numVertices;
-		}
-
-		int getNumTriangles()
-		{
-			return m_numTriangles;
-		}
-
-		int getMaxVertices()
-		{
-			return m_maxVertices;
-		}
-
-		int getMaxTriangles()
-		{
-			return m_maxTriangles;
-		}
-
-		int getFirstVertex()
-		{
-			return m_firstVertex;
-		}
-
-		int getFirstTriangle()
-		{
-			return m_firstTriangle;
-		}
-
-		// TODO: All of these set functions will have to do checks and
-		// update the world because restructuring of the arrays will be necessary
-		// Reasonable use of "friend"?
-		void setNumVertices( int numVertices )
-		{
-			m_numVertices = numVertices;
-		}	
-		
-		void setNumTriangles( int numTriangles )
-		{
-			m_numTriangles = numTriangles;
-		}
-
-		void setMaxVertices( int maxVertices )
-		{
-			m_maxVertices = maxVertices;
-		}
-
-		void setMaxTriangles( int maxTriangles )
-		{
-			m_maxTriangles = maxTriangles;
-		}
-
-		void setFirstVertex( int firstVertex )
-		{
-			m_firstVertex = firstVertex;
-		}
-
-		void setFirstTriangle( int firstTriangle )
-		{
-			m_firstTriangle = firstTriangle;
-		}
-
-		void setMaxLinks( int maxLinks )
-		{
-			m_maxLinks = maxLinks;
-		}
-
-		void setNumLinks( int numLinks )
-		{
-			m_numLinks = numLinks;
-		}
-
-		void setFirstLink( int firstLink )
-		{
-			m_firstLink = firstLink;
-		}
-
-		int getMaxLinks()
-		{
-			return m_maxLinks;
-		}
-
-		int getNumLinks()
-		{
-			return m_numLinks;
-		}
-
-		int getFirstLink()
-		{
-			return m_firstLink;
-		}
-
-		btSoftBody* getSoftBody()
-		{
-			return m_softBody;
-		}
-
-	#if 0
-		void setAcceleration( Vectormath::Aos::Vector3 acceleration )
-		{
-			m_currentSolver->setPerClothAcceleration( m_clothIdentifier, acceleration );
-		}
-
-		void setWindVelocity( Vectormath::Aos::Vector3 windVelocity )
-		{
-			m_currentSolver->setPerClothWindVelocity( m_clothIdentifier, windVelocity );
-		}
-
-		/** 
-		 * Set the density of the air in which the cloth is situated.
-		 */
-		void setAirDensity( btScalar density )
-		{
-			m_currentSolver->setPerClothMediumDensity( m_clothIdentifier, static_cast<float>(density) );
-		}
-
-		/**
-		 * Add a collision object to this soft body.
-		 */
-		void addCollisionObject( btCollisionObject *collisionObject )
-		{
-			m_currentSolver->addCollisionObjectForSoftBody( m_clothIdentifier, collisionObject );
-		}
-	#endif
-	};
-
-
-	class KernelDesc
-	{
-	protected:
-	public:
-		cl::Kernel kernel;
-
-		KernelDesc()
-		{
-		}
-
-		virtual ~KernelDesc()
-		{
-		}
-	}; 

 	btSoftBodyLinkDataOpenCL m_linkData;
 	btSoftBodyVertexDataOpenCL m_vertexData;
@@ -228,7 +189,7 @@ private:
 	 * Cloths owned by this solver.
 	 * Only our cloths are in this array.
 	 */
-	btAlignedObjectArray< btAcceleratedSoftBodyInterface * > m_softBodySet;
+	btAlignedObjectArray< btOpenCLAcceleratedSoftBodyInterface * > m_softBodySet;

 	/** Acceleration value to be applied to all non-static vertices in the solver. 
 	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
@@ -262,37 +223,34 @@ private:
 	btAlignedObjectArray< float >						m_perClothMediumDensity;
 	btOpenCLBuffer<float>								m_clPerClothMediumDensity;

-	KernelDesc		prepareLinksKernel;
-	KernelDesc		solvePositionsFromLinksKernel;
-	KernelDesc		updateConstantsKernel;
-	KernelDesc		integrateKernel;
-	KernelDesc		addVelocityKernel;
-	KernelDesc		updatePositionsFromVelocitiesKernel;
-	KernelDesc		updateVelocitiesFromPositionsWithoutVelocitiesKernel;
-	KernelDesc		updateVelocitiesFromPositionsWithVelocitiesKernel;
-	KernelDesc		vSolveLinksKernel;
-	KernelDesc		resetNormalsAndAreasKernel;
-	KernelDesc		normalizeNormalsAndAreasKernel;
-	KernelDesc		updateSoftBodiesKernel;
-	KernelDesc		outputToVertexArrayWithNormalsKernel;
-	KernelDesc		outputToVertexArrayWithoutNormalsKernel;
+	cl_kernel		prepareLinksKernel;
+	cl_kernel		solvePositionsFromLinksKernel;
+	cl_kernel		updateConstantsKernel;
+	cl_kernel		integrateKernel;
+	cl_kernel		addVelocityKernel;
+	cl_kernel		updatePositionsFromVelocitiesKernel;
+	cl_kernel		updateVelocitiesFromPositionsWithoutVelocitiesKernel;
+	cl_kernel		updateVelocitiesFromPositionsWithVelocitiesKernel;
+	cl_kernel		vSolveLinksKernel;
+	cl_kernel		resetNormalsAndAreasKernel;
+	cl_kernel		normalizeNormalsAndAreasKernel;
+	cl_kernel		updateSoftBodiesKernel;
+	cl_kernel		outputToVertexArrayWithNormalsKernel;
+	cl_kernel		outputToVertexArrayWithoutNormalsKernel;

-	KernelDesc		outputToVertexArrayKernel;
-	KernelDesc		applyForcesKernel;
-	KernelDesc		collideSphereKernel;
-	KernelDesc		collideCylinderKernel;
+	cl_kernel		outputToVertexArrayKernel;
+	cl_kernel		applyForcesKernel;
+	cl_kernel		collideSphereKernel;
+	cl_kernel		collideCylinderKernel;

-	static const int workGroupSize = 128;
-
-	cl::CommandQueue m_queue;
-	cl::Context context;
-	cl::Device device;
+	cl_command_queue	m_cqCommandQue;
+	cl_context			m_cxMainContext;


 	/**
-	 * Compile a compute shader kernel from a string and return the appropriate KernelDesc object.
+	 * Compile a compute shader kernel from a string and return the appropriate cl_kernel object.
 	 */
-	KernelDesc compileCLKernelFromString( const char *shaderString, const char *shaderName );
+	cl_kernel compileCLKernelFromString( const char *shaderString, const char *shaderName );

 	bool buildShaders();

@@ -306,7 +264,7 @@ private:

 	void ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce );
 	
-	btAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
+	btOpenCLAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );

 	virtual void applyForces( float solverdt );

@@ -342,7 +300,7 @@ private:
 	

 public:
-	btOpenCLSoftBodySolver(const cl::CommandQueue &queue);
+	btOpenCLSoftBodySolver(cl_command_queue queue,cl_context	ctx);

 	virtual ~btOpenCLSoftBodySolver();

@@ -371,4 +329,4 @@ public:
 	virtual void copySoftBodyToVertexBuffer( const btSoftBody *const softBody, btVertexBufferDescriptor *vertexBuffer );
 }; // btOpenCLSoftBodySolver

-#endif #ifndef BT_SOFT_BODY_SOLVER_OPENCL_H
+#endif #ifndef BT_SOFT_BODY_SOLVER_OPENCL_H