diff --git a/Demos/CMakeLists.txt b/Demos/CMakeLists.txt
index 7d70282ca..35d90a508 100644
--- a/Demos/CMakeLists.txt
+++ b/Demos/CMakeLists.txt
@@ -14,7 +14,7 @@ IF(BUILD_CPU_DEMOS)
 		CollisionInterfaceDemo ConcaveConvexcastDemo SimplexDemo DynamicControlDemo
 		DoublePrecisionDemo ConcaveDemo CollisionDemo
 		ContinuousConvexCollision ConcaveRaycastDemo GjkConvexCastDemo
-		MultiMaterialDemo SerializeDemo InternalEdgeDemo
+		MultiMaterialDemo SerializeDemo InternalEdgeDemo 
 	)
 ELSE()
 	SET(SharedDemoSubdirs
@@ -28,6 +28,7 @@ ENDIF()
 		MultiThreadedDemo
 		VectorAdd_OpenCL
 		ParticlesOpenCL
+		OpenCLClothDemo
 		)
 
 ELSE (USE_GLUT)
diff --git a/Demos/DX11ClothDemo/btDirectComputeSupport.h b/Demos/DX11ClothDemo/btDirectComputeSupport.h
index 52843484a..69360865a 100644
--- a/Demos/DX11ClothDemo/btDirectComputeSupport.h
+++ b/Demos/DX11ClothDemo/btDirectComputeSupport.h
@@ -1,6 +1,6 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+Copyright (c) 2010 Advanced Micro Devices
 
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
@@ -13,6 +13,8 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
+
+
 #ifndef BT_DIRECT_COMPUTE_SUPPORT_HPP
 #define BT_DIRECT_COMPUTE_SUPPORT_HPP
 
diff --git a/Demos/DX11ClothDemo/cap.h b/Demos/DX11ClothDemo/cap.h
index 38cfae21b..e2d3d8e81 100644
--- a/Demos/DX11ClothDemo/cap.h
+++ b/Demos/DX11ClothDemo/cap.h
@@ -1,3 +1,18 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2010 Advanced Micro Devices
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
 
 class cap 
 {
diff --git a/Demos/DX11ClothDemo/cloth.h b/Demos/DX11ClothDemo/cloth.h
index fd1983811..c130548c1 100644
--- a/Demos/DX11ClothDemo/cloth.h
+++ b/Demos/DX11ClothDemo/cloth.h
@@ -1,4 +1,22 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2010 Advanced Micro Devices
 
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include <fstream>
+#include <iostream>
+#include <iomanip>
 
 class piece_of_cloth 
 {
@@ -171,7 +189,8 @@ public:
 
 			pd3dImmediateContext->PSSetShaderResources(0,1,&texture2D_view);
 
-			pd3dImmediateContext->DrawIndexed( (width*3*2+2 + height*width*3*2), 0, ( UINT )pSubset->VertexStart );
+			//pd3dImmediateContext->DrawIndexed( (width*3*2+2 + height*width*3*2), 0, ( UINT )pSubset->VertexStart );
+			pd3dImmediateContext->DrawIndexed( ((height-1)*(width-1)*3*2), 0, ( UINT )pSubset->VertexStart );
 		}
 
 		SAFE_RELEASE(pd3dImmediateContext);
@@ -246,7 +265,7 @@ public:
 
 
 		//unsigned int indices[] = {0,1,2, 1,3,2};
-		unsigned int* indices = new unsigned int[width*3*2+2 + height*width*3*2];
+		unsigned int* indices = new unsigned int[(height-1)*(width-1)*3*2];
 
 		for(int y = 0; y < height-1; y++)
 		{
@@ -265,7 +284,8 @@ public:
 			}
 		}
 
-		bufferDesc.ByteWidth = sizeof(unsigned int)*(width*3*2+2 + height*width*3*2);
+
+		bufferDesc.ByteWidth = sizeof(unsigned int)*((height-1)*(width-1)*3*2);
 		bufferDesc.BindFlags = D3D11_BIND_INDEX_BUFFER;
 
 		InitData.pSysMem = indices;
diff --git a/Demos/DX11ClothDemo/cloth_renderer.cpp b/Demos/DX11ClothDemo/cloth_renderer.cpp
index 9d65257cb..a5c95e51c 100644
--- a/Demos/DX11ClothDemo/cloth_renderer.cpp
+++ b/Demos/DX11ClothDemo/cloth_renderer.cpp
@@ -32,18 +32,15 @@ class btDX11SIMDAwareSoftBodySolver;
 #include "BulletSoftBody/btSoftBodySolvers.h"
 #include "BulletSoftBody/btDefaultSoftBodySolver.h"
 #include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolver_CPU.h"
-//#include "BulletSoftBody/Solvers/CPU/btAcceleratedSoftBody_CPUVertexSolver.h"
 #include "BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h"
-//#include "BulletSoftBody/Solvers/DX11/btAcceleratedSoftBody_DX11SIMDAwareSolver.h"
-//#include "BulletSoftBody/btAcceleratedSoftBody_DXVertexBuffers.h"
+#include "BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h"
 
 #include "BulletSoftBody/btSoftBodyRigidBodyCollisionConfiguration.h"
 
-//#define USE_SIMDAWARE_SOLVER
-#define USE_GPU_SOLVER
-//#define USE_VERTEX_SOLVER
+#define USE_SIMDAWARE_SOLVER
+//#define USE_GPU_SOLVER
 #define USE_GPU_COPY
-const int numFlags = 2;
+const int numFlags = 5;
 const int clothWidth = 40;
 const int clothHeight = 60;//60;
 float _windAngle = 1.0;//0.4;
@@ -206,6 +203,7 @@ btSoftRigidDynamicsWorld* m_dynamicsWorld;
 btDefaultSoftBodySolver *g_defaultSolver = NULL;
 btCPUSoftBodySolver *g_cpuSolver = NULL;
 btDX11SoftBodySolver *g_dx11Solver = NULL;
+btDX11SIMDAwareSoftBodySolver *g_dx11SIMDSolver = NULL;
 
 btSoftBodySolver *g_solver = NULL;
 
@@ -454,12 +452,17 @@ void initBullet(void)
 #ifdef USE_GPU_SOLVER
 	g_dx11Solver = new btDX11SoftBodySolver( g_pd3dDevice, DXUTGetD3D11DeviceContext() );
 	g_solver = g_dx11Solver;
+#else
+#ifdef USE_SIMDAWARE_SOLVER
+	g_dx11SIMDSolver = new btDX11SIMDAwareSoftBodySolver( g_pd3dDevice, DXUTGetD3D11DeviceContext() );
+	g_solver = g_dx11SIMDSolver;
 #else
 	g_cpuSolver = new btCPUSoftBodySolver;
 	g_solver = g_cpuSolver;
 	//g_defaultSolver = new btDefaultSoftBodySolver;
 	//g_solver = g_defaultSolver;
 #endif
+#endif
 
 
 
@@ -1260,6 +1263,9 @@ void CALLBACK OnD3D11DestroyDevice( void* pUserContext )
 		delete g_cpuSolver;
 	if( g_dx11Solver )
 		delete g_dx11Solver;
+	if( g_dx11SIMDSolver )
+		delete g_dx11SIMDSolver;
+	
 
 	for(int i=0; i< m_collisionShapes.size(); i++)
 		delete m_collisionShapes[i];
diff --git a/Demos/DX11ClothDemo/cylinder.h b/Demos/DX11ClothDemo/cylinder.h
index a9c6edb7e..517fcdf51 100644
--- a/Demos/DX11ClothDemo/cylinder.h
+++ b/Demos/DX11ClothDemo/cylinder.h
@@ -1,3 +1,18 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2010 Advanced Micro Devices
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
 
 class cylinder 
 {
diff --git a/Demos/OpenCLClothDemo/AMD/CMakeLists.txt b/Demos/OpenCLClothDemo/AMD/CMakeLists.txt
new file mode 100644
index 000000000..4140d59f4
--- /dev/null
+++ b/Demos/OpenCLClothDemo/AMD/CMakeLists.txt
@@ -0,0 +1,102 @@
+
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src 
+${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL
+${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL
+)
+
+ADD_DEFINITIONS(-DUSE_AMD_OPENCL)
+ADD_DEFINITIONS(-DCL_PLATFORM_AMD)
+
+
+IF (INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+	INCLUDE_DIRECTORIES(		$ENV{==ATISTREAMSDKROOT=}/include )
+	IF (CMAKE_CL_64)
+		SET(CMAK_ATISTREAMSDK_LIBPATH 		$ENV{==ATISTREAMSDKROOT=}/lib/x86_64		)
+	ELSE(CMAKE_CL_64)
+		SET(CMAK_ATISTREAMSDK_LIBPATH		$ENV{==ATISTREAMSDKROOT=}/lib/x86		)
+	ENDIF(CMAKE_CL_64)
+ELSE()
+	INCLUDE_DIRECTORIES(		$ENV{ATISTREAMSDKROOT}/include	)
+	IF (CMAKE_CL_64)
+		SET(CMAK_ATISTREAMSDK_LIBPATH 		$ENV{ATISTREAMSDKROOT}/lib/x86_64 )
+	ELSE(CMAKE_CL_64)
+		SET(CMAK_ATISTREAMSDK_LIBPATH		$ENV{ATISTREAMSDKROOT}/lib/x86		)
+	ENDIF(CMAKE_CL_64)
+ENDIF()
+
+
+IF (WIN32)	# the prebuilt GLEW import libs under Glut/ are Windows-only; same guard as MiniCL/CMakeLists.txt
+	IF (CMAKE_CL_64)
+		SET(CMAK_GLEW_LIBRARY		${BULLET_PHYSICS_SOURCE_DIR}/Glut/glew64.lib		)
+	ELSE(CMAKE_CL_64)
+		SET(CMAK_GLEW_LIBRARY		${BULLET_PHYSICS_SOURCE_DIR}/Glut/glew32.lib		)
+	ENDIF(CMAKE_CL_64)
+ENDIF(WIN32)
+
+IF (USE_GLUT)
+	LINK_LIBRARIES(
+		OpenGLSupport 
+		BulletSoftBodySolvers_OpenCL_AMD
+		BulletSoftBodySolvers_CPU
+		BulletMultiThreaded
+		BulletSoftBody
+		BulletDynamics  
+		BulletCollision  
+		LinearMath 
+		${GLUT_glut_LIBRARY} 
+		${OPENGL_gl_LIBRARY} 
+		${OPENGL_glu_LIBRARY}
+		${CMAK_GLEW_LIBRARY}
+		${CMAK_ATISTREAMSDK_LIBPATH}/OpenCL.lib
+	)
+
+
+	ADD_EXECUTABLE(AppOpenCLClothDemo_AMD
+		../cl_cloth_demo.cpp
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.h
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.h
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.cpp
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.cpp
+		../gl_win.cpp
+		../clstuff.cpp
+		../bmpLoader.cpp
+		../bmpLoader.h
+		../clstuff.h
+		../gl_win.h
+
+	)
+ELSE (USE_GLUT)
+ENDIF (USE_GLUT)
+
+IF(WIN32)
+IF (CMAKE_CL_64)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_AMD		POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR}	
+					)
+		ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_AMD	POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW64.DLL ${CMAKE_CURRENT_BINARY_DIR})
+	ENDIF()
+ELSE(CMAKE_CL_64)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_AMD	POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR}
+					)
+		ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_AMD	POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW32.DLL ${CMAKE_CURRENT_BINARY_DIR})
+
+	ENDIF()
+ENDIF(CMAKE_CL_64)
+ENDIF(WIN32)
+
+ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_AMD	POST_BUILD
+			COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/amdFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
+			COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/atiFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
+			)
+
+IF (UNIX)
+  TARGET_LINK_LIBRARIES(AppOpenCLClothDemo_AMD pthread)
+ENDIF(UNIX)
+
diff --git a/Demos/OpenCLClothDemo/Apple/CMakeLists.txt b/Demos/OpenCLClothDemo/Apple/CMakeLists.txt
new file mode 100644
index 000000000..e89513c18
--- /dev/null
+++ b/Demos/OpenCLClothDemo/Apple/CMakeLists.txt
@@ -0,0 +1,60 @@
+
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src 
+${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL
+${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL
+)
+
+
+IF (APPLE)
+  FIND_LIBRARY(OPENCL_LIBRARY OpenCL DOC "OpenCL lib for OSX")
+  FIND_PATH(OPENCL_INCLUDE_DIR OpenCL/cl.h DOC "Include for OpenCL on OSX")
+ENDIF (APPLE)
+
+
+IF (USE_GLUT)
+	LINK_LIBRARIES(
+		OpenGLSupport 
+		BulletSoftBodySolvers_OpenCL_Apple
+		BulletSoftBodySolvers_CPU
+		BulletMultiThreaded
+		BulletSoftBody
+		BulletDynamics  
+		BulletCollision  
+		LinearMath
+		${OPENCL_LIBRARY}
+		${GLUT_glut_LIBRARY} 
+		${OPENGL_gl_LIBRARY} 
+		${OPENGL_glu_LIBRARY}
+		${CMAK_GLEW_LIBRARY}
+	)
+
+
+	ADD_EXECUTABLE(AppOpenCLClothDemo_Apple
+		../cl_cloth_demo.cpp
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.h
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.h
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.cpp
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.cpp
+		../gl_win.cpp
+		../clstuff.cpp
+		../bmpLoader.cpp
+		../bmpLoader.h
+		../clstuff.h
+		../gl_win.h
+
+	)
+ELSE (USE_GLUT)
+ENDIF (USE_GLUT)
+
+
+ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_Apple	POST_BUILD
+			COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/amdFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
+			COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/atiFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
+			)
+
+IF (UNIX)
+  TARGET_LINK_LIBRARIES(AppOpenCLClothDemo_Apple pthread)
+ENDIF(UNIX)
+
diff --git a/Demos/OpenCLClothDemo/CLClothDemo.sln b/Demos/OpenCLClothDemo/CLClothDemo.sln
new file mode 100644
index 000000000..48af26cde
--- /dev/null
+++ b/Demos/OpenCLClothDemo/CLClothDemo.sln
@@ -0,0 +1,20 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CLClothDemo", "CLClothDemo.vcproj", "{A61906AF-B5DE-454E-99F6-B653C250D221}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{A61906AF-B5DE-454E-99F6-B653C250D221}.Debug|Win32.ActiveCfg = Debug|Win32
+		{A61906AF-B5DE-454E-99F6-B653C250D221}.Debug|Win32.Build.0 = Debug|Win32
+		{A61906AF-B5DE-454E-99F6-B653C250D221}.Release|Win32.ActiveCfg = Release|Win32
+		{A61906AF-B5DE-454E-99F6-B653C250D221}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/Demos/OpenCLClothDemo/CLClothDemo.vcproj b/Demos/OpenCLClothDemo/CLClothDemo.vcproj
new file mode 100644
index 000000000..1023daf69
--- /dev/null
+++ b/Demos/OpenCLClothDemo/CLClothDemo.vcproj
@@ -0,0 +1,233 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="CLClothDemo"
+	ProjectGUID="{A61906AF-B5DE-454E-99F6-B653C250D221}"
+	RootNamespace="CLClothDemo"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="196613"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="1"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\..\Glut;&quot;$(ATISTREAMSDKROOT)\include&quot;;..\..\src"
+				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkLibraryDependencies="false"
+				AdditionalDependencies="glew32.lib OpenCL.lib ..\..\lib\Debug\BulletDynamics.lib ..\..\lib\Debug\BulletCollision.lib ..\..\lib\Debug\LinearMath.lib ..\..\lib\Debug\BulletSoftBody.lib ..\..\lib\Debug\BulletSoftBodySolvers_CPU.lib ..\..\lib\Debug\BulletSoftBodySolvers_OpenCL.lib"
+				LinkIncremental="1"
+				AdditionalLibraryDirectories="&quot;$(ATISTREAMSDKROOT)\lib\x86&quot;;..\..\Glut;..\..\lib\Debug"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				ImportLibrary="$(TargetDir)$(TargetName).lib"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="1"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				EnableIntrinsicFunctions="true"
+				AdditionalIncludeDirectories="..\..\Glut;&quot;$(ATISTREAMSDKROOT)\include&quot;;..\..\src"
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="glew32.lib OpenCL.lib BulletSoftBody.lib BulletDynamics.lib BulletCollision.lib LinearMath.lib BulletSoftBodySolvers_CPU.lib BulletSoftBodySolvers_OpenCL.lib"
+				LinkIncremental="1"
+				AdditionalLibraryDirectories="..\..\lib\Release;&quot;$(ATISTREAMSDKROOT)\lib\x86&quot;;..\..\Glut"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+			>
+			<File
+				RelativePath=".\bmpLoader.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\cl_cloth_demo.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\clstuff.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\gl_win.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+			>
+			<File
+				RelativePath=".\bmpLoader.h"
+				>
+			</File>
+			<File
+				RelativePath=".\btOpenCLSupport.h"
+				>
+			</File>
+			<File
+				RelativePath=".\cloth.h"
+				>
+			</File>
+			<File
+				RelativePath=".\clstuff.h"
+				>
+			</File>
+			<File
+				RelativePath=".\gl_win.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+			>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
diff --git a/Demos/OpenCLClothDemo/CMakeLists.txt b/Demos/OpenCLClothDemo/CMakeLists.txt
new file mode 100644
index 000000000..1f378a3e1
--- /dev/null
+++ b/Demos/OpenCLClothDemo/CMakeLists.txt
@@ -0,0 +1,15 @@
+IF(BUILD_MINICL_OPENCL_DEMOS)	# each block below adds one platform-specific build directory of the cloth demo
+	SUBDIRS( MiniCL  )
+ENDIF()
+
+IF(BUILD_AMD_OPENCL_DEMOS)	# AMD/CMakeLists.txt locates its SDK through the ATISTREAMSDKROOT environment variable
+	SUBDIRS(AMD)
+ENDIF()
+
+IF(BUILD_NVIDIA_OPENCL_DEMOS)	# NVidia/CMakeLists.txt locates its SDK through the NVSDKCOMPUTE_ROOT environment variable
+	SUBDIRS(NVidia)
+ENDIF()
+
+IF(APPLE)	# keyed on the platform itself rather than on a BUILD_* option
+	SUBDIRS(Apple)
+ENDIF()
diff --git a/Demos/OpenCLClothDemo/MiniCL/CMakeLists.txt b/Demos/OpenCLClothDemo/MiniCL/CMakeLists.txt
new file mode 100644
index 000000000..e6e216763
--- /dev/null
+++ b/Demos/OpenCLClothDemo/MiniCL/CMakeLists.txt
@@ -0,0 +1,86 @@
+
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src 
+${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL
+${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL
+)
+
+ADD_DEFINITIONS(-DUSE_MINICL)
+
+IF (WIN32)
+	IF (CMAKE_CL_64)
+		SET(CMAK_GLEW_LIBRARY
+			${BULLET_PHYSICS_SOURCE_DIR}/Glut/glew64.lib		)
+	ELSE(CMAKE_CL_64)
+		SET(CMAK_GLEW_LIBRARY		${BULLET_PHYSICS_SOURCE_DIR}/Glut/glew32.lib		)
+	ENDIF(CMAKE_CL_64)
+ENDIF()
+
+IF (USE_GLUT)
+	LINK_LIBRARIES(
+		OpenGLSupport 
+		BulletSoftBodySolvers_OpenCL_Mini
+		BulletSoftBodySolvers_CPU
+		MiniCL
+		BulletMultiThreaded
+		BulletSoftBody
+		BulletDynamics  
+		BulletCollision  
+		LinearMath 
+		${GLUT_glut_LIBRARY} 
+		${OPENGL_gl_LIBRARY} 
+		${OPENGL_glu_LIBRARY}
+		${CMAK_GLEW_LIBRARY}
+		
+	)
+
+
+	ADD_EXECUTABLE(AppOpenCLClothDemo_Mini
+		../cl_cloth_demo.cpp
+		../gl_win.cpp
+		../clstuff.cpp
+		../bmpLoader.cpp
+		../bmpLoader.h
+		../clstuff.h
+		../gl_win.h
+		${BULLET_PHYSICS_SOURCE_DIR}/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.h
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.h
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.cpp
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.cpp
+
+	)
+ELSE (USE_GLUT)
+ENDIF (USE_GLUT)
+
+IF(WIN32)
+IF (CMAKE_CL_64)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_Mini		POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR}	
+					)
+		ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_Mini	POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW64.DLL ${CMAKE_CURRENT_BINARY_DIR})
+	ENDIF()
+ELSE(CMAKE_CL_64)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_Mini	POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR}
+					)
+		ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_Mini	POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW32.DLL ${CMAKE_CURRENT_BINARY_DIR})
+
+	ENDIF()
+ENDIF(CMAKE_CL_64)
+ENDIF(WIN32)
+
+ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_Mini	POST_BUILD
+			COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/amdFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
+			COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/atiFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
+			)
+
+IF (UNIX)
+  TARGET_LINK_LIBRARIES(AppOpenCLClothDemo_Mini pthread)
+ENDIF(UNIX)
+
diff --git a/Demos/OpenCLClothDemo/NVidia/CMakeLists.txt b/Demos/OpenCLClothDemo/NVidia/CMakeLists.txt
new file mode 100644
index 000000000..d41b8f377
--- /dev/null
+++ b/Demos/OpenCLClothDemo/NVidia/CMakeLists.txt
@@ -0,0 +1,102 @@
+
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src 
+${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL
+${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL
+)
+
+
+
+
+IF(INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+	INCLUDE_DIRECTORIES( $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/inc	)
+	IF (CMAKE_CL_64)
+		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/x64    )
+	ELSE(CMAKE_CL_64)
+		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/Win32	)
+	ENDIF(CMAKE_CL_64)
+ELSE()
+	INCLUDE_DIRECTORIES( $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/inc	)
+	IF (CMAKE_CL_64)
+		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/x64 )
+	ELSE(CMAKE_CL_64)
+		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/Win32	)
+	ENDIF(CMAKE_CL_64)
+ENDIF()
+
+
+
+IF (WIN32)	# the prebuilt GLEW import libs under Glut/ are Windows-only; same guard as MiniCL/CMakeLists.txt
+	IF (CMAKE_CL_64)
+		SET(CMAK_GLEW_LIBRARY		${BULLET_PHYSICS_SOURCE_DIR}/Glut/glew64.lib		)
+	ELSE(CMAKE_CL_64)
+		SET(CMAK_GLEW_LIBRARY		${BULLET_PHYSICS_SOURCE_DIR}/Glut/glew32.lib		)
+	ENDIF(CMAKE_CL_64)
+ENDIF(WIN32)
+
+IF (USE_GLUT)
+	LINK_LIBRARIES(
+		OpenGLSupport 
+		BulletSoftBodySolvers_OpenCL_NVidia
+		BulletSoftBodySolvers_CPU
+		BulletMultiThreaded
+		BulletSoftBody
+		BulletDynamics  
+		BulletCollision  
+		LinearMath 
+		${GLUT_glut_LIBRARY} 
+		${OPENGL_gl_LIBRARY} 
+		${OPENGL_glu_LIBRARY}
+		${CMAK_GLEW_LIBRARY}
+		${CMAK_NVSDKCOMPUTE_LIBPATH}/OpenCL.lib
+	)
+
+
+	ADD_EXECUTABLE(AppOpenCLClothDemo_NVidia
+		../cl_cloth_demo.cpp
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.h
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.h
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.cpp
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.cpp
+		../gl_win.cpp
+		../clstuff.cpp
+		../bmpLoader.cpp
+		../bmpLoader.h
+		../clstuff.h
+		../gl_win.h
+
+	)
+ELSE (USE_GLUT)
+ENDIF (USE_GLUT)
+
+IF(WIN32)
+IF (CMAKE_CL_64)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_NVidia		POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR}	
+					)
+		ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_NVidia	POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW64.DLL ${CMAKE_CURRENT_BINARY_DIR})
+	ENDIF()
+ELSE(CMAKE_CL_64)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_NVidia	POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR}
+					)
+		ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_NVidia	POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW32.DLL ${CMAKE_CURRENT_BINARY_DIR})
+
+	ENDIF()
+ENDIF(CMAKE_CL_64)
+ENDIF(WIN32)
+
+ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_NVidia	POST_BUILD
+			COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/amdFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
+			COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/atiFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
+			)
+
+IF (UNIX)
+  TARGET_LINK_LIBRARIES(AppOpenCLClothDemo_NVidia pthread)
+ENDIF(UNIX)
+
diff --git a/Demos/OpenCLClothDemo/amdFlag.bmp b/Demos/OpenCLClothDemo/amdFlag.bmp
new file mode 100644
index 000000000..dd1d394ec
Binary files /dev/null and b/Demos/OpenCLClothDemo/amdFlag.bmp differ
diff --git a/Demos/OpenCLClothDemo/atiFlag.bmp b/Demos/OpenCLClothDemo/atiFlag.bmp
new file mode 100644
index 000000000..2be4847dd
Binary files /dev/null and b/Demos/OpenCLClothDemo/atiFlag.bmp differ
diff --git a/Demos/OpenCLClothDemo/bmpLoader.cpp b/Demos/OpenCLClothDemo/bmpLoader.cpp
new file mode 100644
index 000000000..75e89f433
--- /dev/null
+++ b/Demos/OpenCLClothDemo/bmpLoader.cpp
@@ -0,0 +1,325 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2010 Advanced Micro Devices
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "bmpLoader.h"
+
+#include <new>
+#include <cstring>
+#include <cstdio>
+
+namespace amd
+{
+
+static const short bitMapID = 19778;
+
+// Frees the pixel and palette buffers and marks the bitmap as not loaded.
+// Safe to call repeatedly: both pointers are reset to NULL after being freed.
+void
+BitMap::releaseResources(void)
+{
+    // delete[] on a null pointer is a no-op, so no explicit guard is needed.
+    delete[] colors_;
+    colors_ = NULL;
+
+    delete[] pixels_;
+    pixels_ = NULL;
+
+    // Nothing is held any more.
+    isLoaded_ = false;
+}
+
+// Copy assignment: clones the headers and, when rhs holds a loaded image, deep
+// copies its palette and pixel buffers.
+//
+// \param rhs bitmap to clone.
+// \return *this.
+//
+// NOTE(review): buffers already owned by *this are overwritten without being
+// freed. They cannot be released here because the copy constructor calls this
+// operator before pixels_/colors_ hold valid values - confirm and fix together.
+//
+// On allocation failure the object is left empty with isLoaded_ == false.
+BitMap& BitMap::operator=(const BitMap& rhs)
+{
+    if (this == &rhs) {
+        return *this;
+    }
+
+    // Copy both header structs member-wise through the base-class subobjects.
+    static_cast<BitMapHeader&>(*this)     = rhs;
+    static_cast<BitMapInfoHeader&>(*this) = rhs;
+
+    // Start from an empty state; isLoaded_ only becomes true again once every
+    // buffer has been cloned successfully.
+    numColors_ = rhs.numColors_;
+    isLoaded_  = false;
+    pixels_    = NULL;
+    colors_    = NULL;
+    if (!rhs.isLoaded_) {
+        return *this;
+    }
+
+    // <new> is included for std::nothrow: plain new[] throws std::bad_alloc
+    // and never returns NULL, which made the original NULL checks dead code.
+    if (rhs.colors_ != NULL) {
+        colors_ = new (std::nothrow) ColorPalette[numColors_];
+        if (colors_ == NULL) {
+            return *this;
+        }
+        memcpy(colors_, rhs.colors_, numColors_ * sizeof(ColorPalette));
+    }
+
+    // One uchar4 per pixel, width * height entries, exactly as load() allocates.
+    const size_t pixelCount = static_cast<size_t>(width) * static_cast<size_t>(height);
+    pixels_ = new (std::nothrow) uchar4[pixelCount];
+    if (pixels_ == NULL) {
+        delete[] colors_;
+        colors_ = NULL;
+        return *this;
+    }
+    memcpy(pixels_, rhs.pixels_, pixelCount * sizeof(uchar4));
+
+    isLoaded_ = true;
+    return *this;
+}
+
+// Loads an uncompressed 8 bit (palettised) or 24 bit BMP file.
+//
+// \param filename path of the bitmap file to read.
+//
+// Any previously loaded image is released first. On success pixels_ holds
+// width * height uchar4 entries in file row order and isLoaded() becomes
+// true; on any failure every buffer is released and isLoaded() stays false.
+//
+// The two headers are read straight into the BitMapHeader/BitMapInfoHeader
+// base subobjects, so this relies on the pack(1) layout declared in the
+// header. NOTE(review): that also assumes a little-endian host - confirm.
+void
+BitMap::load(const char * filename)
+{
+    // Release any existing resources
+    releaseResources();
+
+    // Open BMP file
+    FILE * fd = fopen(filename, "rb");
+    if (fd == NULL) {
+        return;
+    }
+
+    // Raw, padded pixel rows as stored in the file; freed on every path below.
+    unsigned char * tmpPixels = NULL;
+    bool ok = false;
+
+    // Single-pass block: 'break' jumps to the shared cleanup after the loop.
+    do {
+        // Read header and confirm that we have a bitmap file. A truncated file
+        // makes fread return a short count without setting the stream error
+        // flag, so the count itself is tested rather than ferror().
+        if (fread((BitMapHeader *)this, sizeof(BitMapHeader), 1, fd) != 1 || id != bitMapID) {
+            break;
+        }
+
+        // Read map info header. No support for compressed images, and only
+        // 8 or 24 bits per pixel are handled.
+        if (fread((BitMapInfoHeader *)this, sizeof(BitMapInfoHeader), 1, fd) != 1 || compression ||
+            (bitsPerPixel != 8 && bitsPerPixel != 24)) {
+            break;
+        }
+
+        // Reject empty or negative dimensions (a negative height would make
+        // the buffer sizes below meaningless) and a missing data offset...
+        if (width <= 0 || height <= 0 || offset <= 0) {
+            break;
+        }
+
+        // ...and sizes whose buffers would overflow size_t.
+        const size_t maxPixels = static_cast<size_t>(-1) / sizeof(uchar4);
+        if (static_cast<size_t>(height) > maxPixels / static_cast<size_t>(width)) {
+            break;
+        }
+
+        // Store number of colors
+        numColors_ = 1 << bitsPerPixel;
+
+        // Load the palette for 8 bits per pixel
+        if (bitsPerPixel == 8) {
+            colors_ = new (std::nothrow) ColorPalette[numColors_];
+            if (colors_ == NULL) {
+                break;
+            }
+            memset(colors_, 0, numColors_ * sizeof(ColorPalette));
+            // clrUsed == 0 means the file stores the full palette; otherwise
+            // only clrUsed entries are present and the rest stay zeroed.
+            const size_t stored = (clrUsed > 0 && clrUsed < numColors_)
+                                ? static_cast<size_t>(clrUsed)
+                                : static_cast<size_t>(numColors_);
+            if (fread(colors_, sizeof(ColorPalette), stored, fd) != stored) {
+                break;
+            }
+        }
+
+        // Every stored row is padded to a multiple of four bytes. The padding
+        // depends on the bytes per pixel: 1 for 8 bit images, 3 for 24 bit.
+        const size_t rowBytes   = static_cast<size_t>(width) * (bitsPerPixel / 8);
+        const size_t rowPadding = (4 - rowBytes % 4) % 4;
+        const size_t dataBytes  = (rowBytes + rowPadding) * height;
+
+        // Pixel data starts 'offset' bytes into the file.
+        if (fseek(fd, offset, SEEK_SET) != 0) {
+            break;
+        }
+
+        // Read pixels from file, including any padding
+        tmpPixels = new (std::nothrow) unsigned char[dataBytes];
+        if (tmpPixels == NULL) {
+            break;
+        }
+        if (fread(tmpPixels, 1, dataBytes, fd) != dataBytes) {
+            break;
+        }
+
+        // Allocate image
+        const size_t pixelCount = static_cast<size_t>(width) * height;
+        pixels_ = new (std::nothrow) uchar4[pixelCount];
+        if (pixels_ == NULL) {
+            break;
+        }
+        // Set image, including w component (white)
+        memset(pixels_, 0xff, pixelCount * sizeof(uchar4));
+
+        size_t index = 0;
+        for (int y = 0; y < height; y++) {
+            uchar4 * row = pixels_ + static_cast<size_t>(y) * width;
+            for (int x = 0; x < width; x++) {
+                if (bitsPerPixel == 8) {
+                    row[x] = colors_[tmpPixels[index++]];
+                }
+                else { // 24 bit
+                    row[x].z = tmpPixels[index++];
+                    row[x].y = tmpPixels[index++];
+                    row[x].x = tmpPixels[index++];
+                }
+            }
+            index += rowPadding; // Handle padding
+        }
+        ok = true;
+    } while (false);
+
+    // Shared cleanup for success and failure (array forms of delete throughout).
+    delete[] tmpPixels;
+    fclose(fd);
+    if (!ok) {
+        releaseResources();
+        return;
+    }
+
+    // Loaded file so record this fact
+    isLoaded_  = true;
+}
+
+// Returns the index of the first palette entry equal to color in all four
+// components, or 0 when the palette holds no such entry.
+int
+BitMap::colorIndex(uchar4 color)
+{
+    // uchar4 is four unsigned chars, so a byte-wise compare covers x, y, z, w.
+    const ColorPalette * entry = colors_;
+    for (int slot = 0; slot < numColors_; ++slot, ++entry) {
+        if (memcmp(entry, &color, sizeof(uchar4)) == 0) {
+            return slot;
+        }
+    }
+    return 0;
+}
+
+// Writes the loaded image to filename as an uncompressed BMP.
+//
+// The header fields are written exactly as currently stored in this object,
+// followed by the palette (8 bit images only) and the pixel rows, each row
+// padded with zero bytes to a multiple of four bytes.
+//
+// For 8 bit images each pixel is stored as the index colorIndex() finds for
+// it; colours missing from the palette therefore come out as index 0.
+//
+// \param filename path of the bitmap file to create or overwrite.
+// \return true when everything was written and the file closed cleanly;
+// false when no image is loaded, the file cannot be opened or a write fails.
+bool
+BitMap::write(const char * filename)
+{
+    if (!isLoaded_) {
+        return false;
+    }
+
+    // Open BMP file
+    FILE * fd = fopen(filename, "wb");
+    if (fd == NULL) {
+        return false;
+    }
+
+    // 'ok' carries the first failure through to the single fclose() below.
+    // Write header
+    bool ok = fwrite((BitMapHeader *)this, sizeof(BitMapHeader), 1, fd) == 1;
+
+    // Write map info header
+    if (ok) {
+        ok = fwrite((BitMapInfoHeader *)this, sizeof(BitMapInfoHeader), 1, fd) == 1;
+    }
+
+    // Write palette for 8 bits per pixel
+    if (ok && bitsPerPixel == 8) {
+        const size_t entries = static_cast<size_t>(numColors_);
+        ok = colors_ != NULL &&
+             fwrite(colors_, sizeof(ColorPalette), entries, fd) == entries;
+    }
+
+    // Row padding depends on the bytes per pixel: 1 for 8 bit, otherwise 3.
+    // (Using 3 * width for 8 bit rows, as before, pads them incorrectly.)
+    const int bytesPerPixel = (bitsPerPixel == 8) ? 1 : 3;
+    const int rowPadding    = (4 - (bytesPerPixel * width) % 4) % 4;
+
+    for (int y = 0; ok && y < height; y++) {
+        const uchar4 * row = pixels_ + y * width;
+        for (int x = 0; x < width; x++) {
+            if (bitsPerPixel == 8) {
+                // Palettised: store the palette index of the pixel colour
+                fputc(colorIndex(row[x]), fd);
+            }
+            else { // 24 bit
+                fputc(row[x].z, fd);
+                fputc(row[x].y, fd);
+                fputc(row[x].x, fd);
+            }
+        }
+
+        // Add padding
+        for (int pad = 0; pad < rowPadding; pad++) {
+            fputc(0, fd);
+        }
+
+        // fputc failures set the stream error flag; test it once per row
+        ok = !ferror(fd);
+    }
+
+    // Close on every path: the success path used to return without closing,
+    // leaking the handle and leaving buffered data unflushed. fclose() flushes,
+    // so a failure here also means the file is incomplete.
+    if (fclose(fd) != 0) {
+        ok = false;
+    }
+
+    return ok;
+}
+
+} // amd
diff --git a/Demos/OpenCLClothDemo/bmpLoader.h b/Demos/OpenCLClothDemo/bmpLoader.h
new file mode 100644
index 000000000..301ad0d12
--- /dev/null
+++ b/Demos/OpenCLClothDemo/bmpLoader.h
@@ -0,0 +1,201 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2010 Advanced Micro Devices
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef BMPLOADER_H_
+#define BMPLOADER_H_
+
+#include <cstdlib>
+#include <iostream>
+
+namespace amd
+{
+
+//! @fixme this needs to be moved to common types header?
+#pragma pack(1)
+typedef struct // Four one-byte components; used both as a pixel and as a palette entry.
+{
+    unsigned char x; //!< BitMap::load() stores the third byte of each 24 bit pixel here
+    unsigned char y; //!< BitMap::load() stores the second byte of each 24 bit pixel here
+    unsigned char z; //!< BitMap::load() stores the first byte of each 24 bit pixel here
+    unsigned char w; //!< left at 0xff by BitMap::load() for 24 bit images
+} uchar4;
+//! A palette entry has the same four-byte layout as a pixel.
+typedef uchar4 ColorPalette;
+
+//! \struct Bitmap header info
+typedef struct {
+    short id;        //!< file signature; BitMap::load() requires it to equal bitMapID (19778)
+    int size;        //!< size field as stored in the file; presumably the total file size - TODO confirm
+    short reserved1; //!< kept as read and written back unchanged
+    short reserved2; //!< kept as read and written back unchanged
+    int offset;      //!< offset field; BitMap::load() uses it when reading the pixel data
+} BitMapHeader;
+
+//! \struct Bitmap info header
+typedef struct {
+    int sizeInfo;         //!< header field kept as read from the file
+    int width;            //!< image width in pixels
+    int height;           //!< image height in pixels (number of rows)
+    short planes;         //!< header field kept as read from the file
+    short bitsPerPixel;   //!< 8 selects the palette path; other depths load() accepts are handled as 24 bit
+    unsigned compression; //!< must be 0: BitMap::load() rejects any other value
+    unsigned imageSize;   //!< header field kept as read from the file
+    int xPelsPerMeter;    //!< header field kept as read from the file
+    int yPelsPerMeter;    //!< header field kept as read from the file
+    int clrUsed;          //!< header field kept as read from the file
+    int clrImportant;     //!< header field kept as read from the file
+} BitMapInfoHeader;
+
+//! \class Bitmap used to load a bitmap image from a file.
+//! Inherits both file headers so they can be read and written in place;
+//! note that the surrounding pack(1) pragma applies to this class as well.
+class BitMap : public BitMapHeader, public BitMapInfoHeader
+{
+private:
+    uchar4 * pixels_;        //!< width * height pixels, or NULL when nothing is loaded
+
+    int numColors_;          //!< 1 << bitsPerPixel, set by load()
+
+    ColorPalette * colors_;  //!< palette of 8 bit images, otherwise NULL
+
+    bool isLoaded_;          //!< true while an image is held
+
+    void releaseResources(void);
+
+    int colorIndex(uchar4 color);
+public:
+
+    //! \brief Default constructor
+    //! Creates an empty bitmap; the inherited header fields stay uninitialised.
+    BitMap()
+        : pixels_(NULL),
+          numColors_(0),
+          colors_(NULL),
+          isLoaded_(false)
+    {}
+
+    /*!\brief Constructor
+     *
+     * Tries to load bitmap image from filename provided.
+     *
+     * \param filename pointer to null terminated string that is the path and
+     * filename to the bitmap image to be loaded.
+     *
+     * In the case of an error, e.g. the bitmap file could not be loaded for
+     * some reason, then a following call to isLoaded will return false.
+     */
+    BitMap(const char * filename)
+        : pixels_(NULL),
+          numColors_(0),
+          colors_(NULL),
+          isLoaded_(false)
+    {
+        load(filename);
+    }
+
+    /*! \brief Copy constructor
+     *
+     * \param rhs is the bitmap to be copied (cloned).
+     *
+     * Members start out empty so operator= never sees indeterminate values.
+     */
+    BitMap(const BitMap& rhs)
+        : pixels_(NULL),
+          numColors_(0),
+          colors_(NULL),
+          isLoaded_(false)
+    {
+        *this = rhs;
+    }
+
+    //! \brief Destructor
+    ~BitMap()
+    {
+        releaseResources();
+    }
+
+    /*! \brief Assignment
+     * \param rhs is the bitmap to be assigned (cloned).
+     */
+    BitMap& operator=(const BitMap& rhs);
+
+    /*! \brief Load Bitmap image
+     *
+     * \param filename is a pointer to a null terminated string that is the
+     * path and filename of the bitmap file to be loaded.
+     *
+     * In the case of an error, e.g. the bitmap file could not be loaded for
+     * some reason, then a following call to isLoaded will return false.
+     */
+    void
+    load(const char * filename);
+
+    /*! \brief Write Bitmap image
+     *
+     * \param filename is a pointer to a null terminated string that is the
+     * path and filename of the bitmap file to be written.
+     *
+     * \return In the case that the bitmap is written true is returned. In
+     * the case that a bitmap image is not already loaded or the write fails
+     * for some reason false is returned.
+     */
+    bool
+    write(const char * filename);
+
+    /*! \brief Get image width
+     *
+     * \return If a bitmap image has been successfully loaded, then the image
+     * width is returned, otherwise -1.
+     */
+    int
+    getWidth(void) const
+    {
+        return isLoaded_ ? width : -1;
+    }
+
+    /*! \brief Get image height
+     *
+     * \return If a bitmap image has been successfully loaded, then the image
+     * height is returned, otherwise -1.
+     */
+    int
+    getHeight(void) const
+    {
+        return isLoaded_ ? height : -1;
+    }
+
+    /*! \brief Get image pixels
+     *
+     * \return If a bitmap image has been successfully loaded, then returns
+     * a pointer to image's pixels, otherwise NULL. The buffer stays owned by
+     * this object.
+     */
+    const uchar4 *
+    getPixels(void) const { return pixels_; }
+
+    /*! \brief Is an image currently loaded
+     *
+     * \return If a bitmap image has been successfully loaded, then returns
+     * true, otherwise if an image could not be loaded or an image has yet
+     * to be loaded false is returned.
+     */
+    bool
+    isLoaded(void) const { return isLoaded_; }
+};
+#pragma pack()
+}
+
+#endif // BMPLOADER_H_
diff --git a/Demos/OpenCLClothDemo/bmpLoader.hpp b/Demos/OpenCLClothDemo/bmpLoader.hpp
new file mode 100644
index 000000000..2daae0a47
--- /dev/null
+++ b/Demos/OpenCLClothDemo/bmpLoader.hpp
@@ -0,0 +1,189 @@
+//
+// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef BMPLOADER_H_
+#define BMPLOADER_H_
+
+#include <cstdlib>
+#include <iostream>
+
+namespace amd
+{
+
+//! @fixme this needs to be moved to common types header?
+#pragma pack(1)
+typedef struct // Four one-byte components; used both as a pixel and as a palette entry.
+{
+    unsigned char x; //!< first component
+    unsigned char y; //!< second component
+    unsigned char z; //!< third component
+    unsigned char w; //!< fourth component
+} uchar4;
+//! A palette entry has the same four-byte layout as a pixel.
+typedef uchar4 ColorPalette;
+
+//! \struct Bitmap header info
+typedef struct {
+    short id;        //!< file signature field
+    int size;        //!< size field as stored in the file; presumably the total file size - TODO confirm
+    short reserved1; //!< reserved field, kept as read from the file
+    short reserved2; //!< reserved field, kept as read from the file
+    int offset;      //!< offset field; presumably where the pixel data starts - TODO confirm
+} BitMapHeader;
+
+//! \struct Bitmap info header
+typedef struct {
+    int sizeInfo;         //!< header field kept as read from the file
+    int width;            //!< image width in pixels (returned by BitMap::getWidth)
+    int height;           //!< image height in pixels (returned by BitMap::getHeight)
+    short planes;         //!< header field kept as read from the file
+    short bitsPerPixel;   //!< colour depth field
+    unsigned compression; //!< compression field
+    unsigned imageSize;   //!< header field kept as read from the file
+    int xPelsPerMeter;    //!< header field kept as read from the file
+    int yPelsPerMeter;    //!< header field kept as read from the file
+    int clrUsed;          //!< header field kept as read from the file
+    int clrImportant;     //!< header field kept as read from the file
+} BitMapInfoHeader;
+
+//! \class Bitmap used to load a bitmap image from a file.
+//! Inherits both file headers so they can be read and written in place;
+//! note that the surrounding pack(1) pragma applies to this class as well.
+class BitMap : public BitMapHeader, public BitMapInfoHeader
+{
+private:
+    uchar4 * pixels_;        //!< image pixels, or NULL when nothing is loaded
+
+    int numColors_;          //!< number of palette colours
+
+    ColorPalette * colors_;  //!< palette buffer, or NULL
+
+    bool isLoaded_;          //!< true while an image is held
+
+    void releaseResources(void);
+
+    int colorIndex(uchar4 color);
+public:
+
+    //! \brief Default constructor
+    //! Creates an empty bitmap; the inherited header fields stay uninitialised.
+    BitMap()
+        : pixels_(NULL),
+          numColors_(0),
+          colors_(NULL),
+          isLoaded_(false)
+    {}
+
+    /*!\brief Constructor
+     *
+     * Tries to load bitmap image from filename provided.
+     *
+     * \param filename pointer to null terminated string that is the path and
+     * filename to the bitmap image to be loaded.
+     *
+     * In the case of an error, e.g. the bitmap file could not be loaded for
+     * some reason, then a following call to isLoaded will return false.
+     */
+    BitMap(const char * filename)
+        : pixels_(NULL),
+          numColors_(0),
+          colors_(NULL),
+          isLoaded_(false)
+    {
+        load(filename);
+    }
+
+    /*! \brief Copy constructor
+     *
+     * \param rhs is the bitmap to be copied (cloned).
+     *
+     * Members start out empty so operator= never sees indeterminate values.
+     */
+    BitMap(const BitMap& rhs)
+        : pixels_(NULL),
+          numColors_(0),
+          colors_(NULL),
+          isLoaded_(false)
+    {
+        *this = rhs;
+    }
+
+    //! \brief Destructor
+    ~BitMap()
+    {
+        releaseResources();
+    }
+
+    /*! \brief Assignment
+     * \param rhs is the bitmap to be assigned (cloned).
+     */
+    BitMap& operator=(const BitMap& rhs);
+
+    /*! \brief Load Bitmap image
+     *
+     * \param filename is a pointer to a null terminated string that is the
+     * path and filename of the bitmap file to be loaded.
+     *
+     * In the case of an error, e.g. the bitmap file could not be loaded for
+     * some reason, then a following call to isLoaded will return false.
+     */
+    void
+    load(const char * filename);
+
+    /*! \brief Write Bitmap image
+     *
+     * \param filename is a pointer to a null terminated string that is the
+     * path and filename of the bitmap file to be written.
+     *
+     * \return In the case that the bitmap is written true is returned. In
+     * the case that a bitmap image is not already loaded or the write fails
+     * for some reason false is returned.
+     */
+    bool
+    write(const char * filename);
+
+    /*! \brief Get image width
+     *
+     * \return If a bitmap image has been successfully loaded, then the image
+     * width is returned, otherwise -1.
+     */
+    int
+    getWidth(void) const
+    {
+        return isLoaded_ ? width : -1;
+    }
+
+    /*! \brief Get image height
+     *
+     * \return If a bitmap image has been successfully loaded, then the image
+     * height is returned, otherwise -1.
+     */
+    int
+    getHeight(void) const
+    {
+        return isLoaded_ ? height : -1;
+    }
+
+    /*! \brief Get image pixels
+     *
+     * \return If a bitmap image has been successfully loaded, then returns
+     * a pointer to image's pixels, otherwise NULL. The buffer stays owned by
+     * this object.
+     */
+    const uchar4 *
+    getPixels(void) const { return pixels_; }
+
+    /*! \brief Is an image currently loaded
+     *
+     * \return If a bitmap image has been successfully loaded, then returns
+     * true, otherwise if an image could not be loaded or an image has yet
+     * to be loaded false is returned.
+     */
+    bool
+    isLoaded(void) const { return isLoaded_; }
+};
+#pragma pack()
+}
+
+#endif // BMPLOADER_H_
diff --git a/Demos/OpenCLClothDemo/btOpenCLSupport.h b/Demos/OpenCLClothDemo/btOpenCLSupport.h
new file mode 100644
index 000000000..5b03e14c5
--- /dev/null
+++ b/Demos/OpenCLClothDemo/btOpenCLSupport.h
@@ -0,0 +1,84 @@
+#ifndef BT_OPENCL_SUPPORT_HPP
+#define BT_OPENCL_SUPPORT_HPP
+
+// OpenCL support
+#include <CL/cl.hpp>
+
+namespace BTAcceleratedSoftBody
+{
+	// Owns an OpenCL context, its device list and a command queue; call InitOpenCLDevice() first.
+	class OpenCLSupportHelper
+	{
+	private:
+		cl::Context m_context;
+		std::vector<cl::Device> m_devices;
+		cl::CommandQueue m_queue;
+	public:
+		OpenCLSupportHelper()
+		{
+		}
+
+		virtual ~OpenCLSupportHelper()
+		{
+		}
+
+		// First device of the context. Only valid after InitOpenCLDevice() returned true.
+		cl::Device getDevice()
+		{
+			return m_devices[0];
+		}
+
+		cl::CommandQueue getCommandQueue()
+		{
+			return m_queue;
+		}
+
+		cl::Context getContext()
+		{
+			return m_context;
+		}
+
+		// Creates a GPU context on the first platform and a command queue on its first device.
+		// \return true on success; false (after asserting in debug builds) when any step fails.
+		bool InitOpenCLDevice()
+		{
+			cl_int err;
+			std::vector<cl::Platform> platforms;
+			err = cl::Platform::get(&platforms);
+			if (err != CL_SUCCESS || platforms.empty())
+			{
+				btAssert( !"Platform::get()" ); // a bare string literal is always true, so negate it
+				return false;
+			}
+
+			cl_context_properties properties[] = {
+				CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0](), 0
+			};
+			m_context = cl::Context(CL_DEVICE_TYPE_GPU, properties, NULL, NULL, &err);
+			if (err != CL_SUCCESS)
+			{
+				btAssert( !"Context::Context()" );
+				return false;
+			}
+
+			m_devices = m_context.getInfo<CL_CONTEXT_DEVICES>();
+			if( m_devices.empty() )
+			{
+				btAssert( !"devices.size() > 0" );
+				return false;
+			}
+
+			m_queue = cl::CommandQueue(m_context, m_devices[0], 0, &err);
+			if (err != CL_SUCCESS)
+			{
+				btAssert( !"CommandQueue::CommandQueue()" );
+				return false;
+			}
+			return true;
+		}
+	};
+
+
+} // namespace BTAcceleratedSoftBody
+
+#endif // #ifndef BT_OPENCL_SUPPORT_HPP
\ No newline at end of file
diff --git a/Demos/OpenCLClothDemo/cl_cloth_demo.cpp b/Demos/OpenCLClothDemo/cl_cloth_demo.cpp
new file mode 100644
index 000000000..b7e22c714
--- /dev/null
+++ b/Demos/OpenCLClothDemo/cl_cloth_demo.cpp
@@ -0,0 +1,470 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2008 Advanced Micro Devices
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifdef _WIN32
+#include <GL/glew.h>
+#endif
+
+#include "clstuff.h"
+#include "gl_win.h"
+#include "cloth.h"
+
+#define USE_GPU_SOLVER
+// With USE_GPU_SOLVER defined, initBullet() creates the OpenCL soft body solver instead of the CPU one.
+
+const int numFlags = 5; // number of soft bodies createFlag() builds
+const int clothWidth = 40; // presumably the flag mesh width handed to createFlag() - TODO confirm
+const int clothHeight = 60;//60; presumably the flag mesh height handed to createFlag() - TODO confirm
+float _windAngle = 1.0;//0.4; wind direction angle fed to cos()/sin() in updatePhysicsWorld()
+float _windStrength = 15; // magnitude applied to the wind direction in updatePhysicsWorld()
+
+
+
+#include <iostream>
+using namespace std;
+
+
+
+
+#include "btBulletDynamicsCommon.h"
+#include "LinearMath/btHashMap.h"
+#include "BulletSoftBody/btSoftRigidDynamicsWorld.h"
+#include "vectormath/vmInclude.h"
+#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolver_CPU.h"
+#include "BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h"
+
+using Vectormath::Aos::Vector3;
+
+class piece_of_cloth; // element type of the global 'cloths' array
+class btBroadphaseInterface;
+class btCollisionShape;
+class btOverlappingPairCache;
+class btCollisionDispatcher;
+class btConstraintSolver;
+struct btCollisionAlgorithmCreateFunc;
+class btDefaultCollisionConfiguration;
+// Forward declaration only; Transform3 is not referenced in the visible part of this file.
+namespace Vectormath
+{
+	namespace Aos
+	{
+		class Transform3;
+	}
+}
+
+
+btAlignedObjectArray<btCollisionShape*>	m_collisionShapes;
+btBroadphaseInterface*	m_broadphase; // created in initBullet()
+btCollisionDispatcher*	m_dispatcher; // created in initBullet()
+btConstraintSolver*	m_solver; // created in initBullet()
+btDefaultCollisionConfiguration* m_collisionConfiguration; // created in initBullet()
+// Exactly one of the two concrete solvers is created by initBullet(), depending on USE_GPU_SOLVER.
+btCPUSoftBodySolver *g_cpuSolver = NULL;
+btOpenCLSoftBodySolver *g_openCLSolver = NULL;
+// Points at whichever of the two solvers above initBullet() created.
+btSoftBodySolver *g_solver = NULL;
+
+btAlignedObjectArray<btSoftBody *> m_flags; // flags whose wind velocity updatePhysicsWorld() sets
+btSoftRigidDynamicsWorld* m_dynamicsWorld; // world the flag soft bodies are added to
+btAlignedObjectArray<piece_of_cloth> cloths;
+// OpenCL handles defined in another translation unit; initBullet() passes the queue and context to the OpenCL solver.
+extern cl_context			g_cxMainContext;
+extern cl_device_id		g_cdDevice;
+extern cl_command_queue	g_cqCommandQue;
+
+// Distance along z between neighbouring flags in createFlag().
+const float flagSpacing = 30.f;
+
+
+// Helper to test and add links correctly.
+// Records, per vertex pair, the first triangle that produced a structural link.
+// When the same edge shows up again no duplicate structural link is created; with
+// createBendLinks set, a bend link joins the two vertices opposite the shared edge instead.
+// trianglesForLinks is a numVertices x numVertices table holding -1 for unseen edges. Always returns true.
+static bool testAndAddLink( btAlignedObjectArray<int> &trianglesForLinks, btSoftBody *softBody, int triangle, int *triangleVertexIndexArray, int numVertices, int vertex0, int vertex1, int nonLinkVertex, btSoftBody::Material *structuralMaterial, bool createBendLinks, btSoftBody::Material *bendMaterial )
+{
+	const int existingTriangle = trianglesForLinks[ numVertices * vertex0 + vertex1 ];
+	if( existingTriangle < 0 )
+	{
+		// Don't yet have link so create it and remember which triangle owns it
+		softBody->appendLink( vertex0, vertex1, structuralMaterial );
+		trianglesForLinks[numVertices * vertex0 + vertex1] = triangle;
+		trianglesForLinks[numVertices * vertex1 + vertex0] = triangle;
+		return true;
+	}
+
+	if( !createBendLinks )
+		return true;
+
+	// Already have link so find the vertex of the other triangle that is not on the shared edge
+	int nodeA = -1;
+	for( int corner = 0; corner < 3; ++corner )
+	{
+		const int candidate = triangleVertexIndexArray[existingTriangle * 3 + corner];
+		if( candidate != vertex0 && candidate != vertex1 )
+			nodeA = candidate;
+	}
+	// A degenerate triangle has no such vertex; skip instead of using an unset index
+	if( nodeA >= 0 )
+		softBody->appendLink( nodeA, nonLinkVertex, bendMaterial );
+	return true;
+}
+
+// Builds a soft body from an indexed triangle mesh: every triangle becomes a face, every
+// distinct edge a structural link and, with createBendLinks, shared edges also get a bend link.
+// NOTE: the edge lookup table below needs numVertices * numVertices ints.
+btSoftBody *createFromIndexedMesh( btVector3 *vertexArray, int numVertices, int *triangleVertexIndexArray, int numTriangles, bool createBendLinks )
+{
+	btSoftBody* softBody = new btSoftBody(&(m_dynamicsWorld->getWorldInfo()), numVertices, vertexArray, 0);
+	btSoftBody::Material * structuralMaterial = softBody->appendMaterial();
+	structuralMaterial->m_kLST = 1.0;
+
+	// Bend links get their own material with a lower m_kLST (0.7 instead of 1.0)
+	btSoftBody::Material * bendMaterial = NULL;
+	if( createBendLinks )
+	{
+		bendMaterial = softBody->appendMaterial();
+		bendMaterial->m_kLST = 0.7;
+	}
+
+	// List of values for each link saying which triangle is associated with that link
+	// -1 to start. Once a value is entered we know the "other" triangle
+	// and can add a link across the link
+	btAlignedObjectArray<int> triangleForLinks;
+	triangleForLinks.resize( numVertices * numVertices, -1 );
+	for( int triangle = 0; triangle < numTriangles; ++triangle )
+	{
+		const int *index = &triangleVertexIndexArray[triangle * 3];
+		softBody->appendFace( index[0], index[1], index[2] );
+		// Generate the structural links directly from the triangles
+		testAndAddLink( triangleForLinks, softBody, triangle, triangleVertexIndexArray, numVertices, index[0], index[1], index[2], structuralMaterial, createBendLinks, bendMaterial );
+		testAndAddLink( triangleForLinks, softBody, triangle, triangleVertexIndexArray, numVertices, index[1], index[2], index[0], structuralMaterial, createBendLinks, bendMaterial );
+		testAndAddLink( triangleForLinks, softBody, triangle, triangleVertexIndexArray, numVertices, index[2], index[0], index[1], structuralMaterial, createBendLinks, bendMaterial);
+	}
+
+	return softBody;
+}
+
+/**
+ * Create a sequence of flag objects and add them to the world.
+ *
+ * Builds one width x height grid mesh, then creates numFlags soft bodies from it,
+ * spaced flagSpacing apart along z, gives zero mass to three nodes of the last grid
+ * row (both ends and the middle) and adds each body to m_dynamicsWorld.
+ *
+ * \param solver  not used by this function. NOTE(review): kept so callers need no change.
+ * \param width   vertices per grid row.
+ * \param height  number of grid rows.
+ * \param flags   receives a pointer to every soft body created.
+ */
+void createFlag( btSoftBodySolver &solver, int width, int height, btAlignedObjectArray<btSoftBody *> &flags )
+{
+	// First create a triangle mesh to represent a flag
+
+	using Vectormath::Aos::Matrix3;
+	using Vectormath::Aos::Vector3;
+
+	// Allocate a simple mesh consisting of a vertex array and a triangle index array
+	btIndexedMesh mesh;
+	mesh.m_numVertices = width*height;
+	mesh.m_numTriangles = 2*(width-1)*(height-1);
+
+	btVector3 *vertexArray = new btVector3[mesh.m_numVertices];
+	mesh.m_vertexBase = reinterpret_cast<const unsigned char*>(vertexArray);
+	// The vertex array holds btVector3 elements, so that is the stride (was sizeof(Vector3))
+	mesh.m_vertexStride = sizeof(btVector3);
+
+	int *triangleVertexIndexArray = new int[3*mesh.m_numTriangles];
+	mesh.m_triangleIndexBase = reinterpret_cast<const unsigned char*>(triangleVertexIndexArray);
+	mesh.m_triangleIndexStride = sizeof(int)*3;
+
+	// Generate normalised object space vertex coordinates for a rectangular flag
+	const float zCoordinate = 0.0f;
+	Matrix3 defaultScale(Vector3(5.f, 0.f, 0.f), Vector3(0.f, 20.f, 0.f), Vector3(0.f, 0.f, 1.f));
+	for( int y = 0; y < height; ++y )
+	{
+		float yCoordinate = y*2.0f/float(height) - 1.0f;
+		for( int x = 0; x < width; ++x )
+		{
+			float xCoordinate = x*2.0f/float(width) - 1.0f;
+
+			Vector3 vertex(xCoordinate, yCoordinate, zCoordinate);
+			Vector3 transformedVertex = defaultScale*vertex;
+
+			vertexArray[y*width + x] = btVector3(transformedVertex.getX(), transformedVertex.getY(), transformedVertex.getZ() );
+		}
+	}
+
+	// Generate vertex indices for triangles: two per grid cell, three ints each
+	for( int y = 0; y < (height-1); ++y )
+	{
+		for( int x = 0; x < (width-1); ++x )
+		{
+			const int topLeft = y*width + x;
+			const int cell = 2*(y*(width-1) + x); // index of the cell's first triangle
+
+			// Triangle 0
+			// Top left of square on mesh
+			int *triangle0 = &triangleVertexIndexArray[3*cell];
+			triangle0[0] = topLeft;
+			triangle0[1] = topLeft + 1;
+			triangle0[2] = topLeft + width;
+
+			// Triangle 1
+			// Bottom right of square on mesh
+			int *triangle1 = triangle0 + 3;
+			triangle1[0] = topLeft + 1;
+			triangle1[1] = topLeft + 1 + width;
+			triangle1[2] = topLeft + width;
+		}
+	}
+
+	float rotateAngleRoundZ = 0.5;
+	float rotateAngleRoundX = 0.5;
+	btMatrix3x3 defaultRotate;
+	defaultRotate[0] = btVector3(cos(rotateAngleRoundZ), sin(rotateAngleRoundZ), 0.f); 
+	defaultRotate[1] = btVector3(-sin(rotateAngleRoundZ), cos(rotateAngleRoundZ), 0.f);
+	defaultRotate[2] = btVector3(0.f, 0.f, 1.f);
+	btMatrix3x3 defaultRotateX;
+	defaultRotateX[0] = btVector3(1.f, 0.f, 0.f);
+	defaultRotateX[1] = btVector3( 0.f, cos(rotateAngleRoundX), sin(rotateAngleRoundX));
+	defaultRotateX[2] = btVector3(0.f, -sin(rotateAngleRoundX), cos(rotateAngleRoundX));
+
+	// Despite the name this is the product of the two rotations only; scaling was applied to the vertices above
+	btMatrix3x3 defaultRotateAndScale( (defaultRotateX*defaultRotate) );
+
+	// Construct the sequence flags applying a slightly different translation to each one to arrange them
+	// appropriately in the scene.
+	for( int i = 0; i < numFlags; ++i )
+	{
+		float zTranslate = flagSpacing * (i-numFlags/2);
+
+		btVector3 defaultTranslate(0.f, 20.f, zTranslate);
+
+		btTransform transform( defaultRotateAndScale, defaultTranslate );
+
+		btSoftBody *softBody = createFromIndexedMesh( vertexArray, mesh.m_numVertices, triangleVertexIndexArray, mesh.m_numTriangles, true );
+
+		// Spread a total mass of 10 evenly over the nodes (loop variable no longer shadows 'i')
+		for( int node = 0; node < mesh.m_numVertices; ++node )
+		{
+			softBody->setMass(node, 10.f/mesh.m_numVertices);
+		}
+		// Zero mass on both ends and the middle of the last row
+		softBody->setMass((height-1)*(width), 0.f);
+		softBody->setMass((height-1)*(width) + width - 1, 0.f);
+		softBody->setMass((height-1)*width + width/2, 0.f);
+		softBody->m_cfg.collisions = btSoftBody::fCollision::CL_SS+btSoftBody::fCollision::CL_RS;	
+
+		flags.push_back( softBody );
+
+		softBody->transform( transform );
+
+		m_dynamicsWorld->addSoftBody( softBody );
+	}
+
+	// The mesh arrays are no longer needed once every flag has been built
+	delete [] vertexArray;
+	delete [] triangleVertexIndexArray;
+}
+
+
+// Called once per simulation step. Every 400th call (starting with the first)
+// the shared wind angle advances by 0.05 and each flag receives a freshly
+// jittered wind velocity in the XZ plane; otherwise only the counter advances.
+void updatePhysicsWorld()
+{
+	static int counter = 0;
+	const bool retargetWind = ( (counter % 400) == 0 );
+	++counter;
+	if( !retargetWind )
+		return;
+
+	// Advance the base direction, restarting from zero after roughly a full turn.
+	_windAngle += 0.05f;
+	if( _windAngle > (2*3.141) )
+		_windAngle = 0;
+
+	for( int flagIndex = 0; flagIndex < m_flags.size(); ++flagIndex )
+	{
+		// Per-flag jitter: a random offset in [-0.05, 0.45] added to the base angle.
+		const float unitRandom = float(rand())/RAND_MAX;
+		float localWind = _windAngle + 0.5*(unitRandom-0.1);
+		float xCoordinate = cos(localWind)*_windStrength;
+		float zCoordinate = sin(localWind)*_windStrength;
+
+		btSoftBody *cloth = m_flags[flagIndex];
+		cloth->setWindVelocity( btVector3(xCoordinate, 0, zCoordinate) );
+	}
+
+	// A disabled experiment that moved capCollider by 0.05 along X per call was
+	// left here as commented-out code in the original.
+}
+
+// Builds the physics scene: selects the soft-body solver at compile time
+// (OpenCL when USE_GPU_SOLVER is defined, CPU otherwise), creates the soft/rigid
+// dynamics world, creates the flags, and gives each entry of `cloths` a
+// descriptor for its CPU vertex buffer.
+void initBullet(void)
+{
+#ifdef USE_GPU_SOLVER
+	g_openCLSolver = new btOpenCLSoftBodySolver( g_cqCommandQue, g_cxMainContext);
+	g_solver = g_openCLSolver;
+#else
+	g_cpuSolver = new btCPUSoftBodySolver;
+	g_solver = g_cpuSolver;
+#endif
+
+	m_collisionConfiguration = new btDefaultCollisionConfiguration();
+	m_dispatcher = new	btCollisionDispatcher(m_collisionConfiguration);
+	m_broadphase = new btDbvtBroadphase();
+	btSequentialImpulseConstraintSolver* sol = new btSequentialImpulseConstraintSolver;
+	m_solver = sol;
+
+	m_dynamicsWorld = new btSoftRigidDynamicsWorld(m_dispatcher, m_broadphase, m_solver, m_collisionConfiguration, g_solver);	
+
+	m_dynamicsWorld->setGravity(btVector3(0,-10,0));	
+	// groundShape/groundTransform are only consumed by the disabled (#if 0) block
+	// below; the shape is still recorded in m_collisionShapes.
+	btCollisionShape* groundShape = new btBoxShape(btVector3(btScalar(50.),btScalar(50.),btScalar(50.)));	
+	m_collisionShapes.push_back(groundShape);
+	btTransform groundTransform;
+	groundTransform.setIdentity();
+	groundTransform.setOrigin(btVector3(0,-50,0));
+
+	// Soft-body world settings: air density 1.2, no water, gravity (0,-10,0).
+	m_dynamicsWorld->getWorldInfo().air_density			=	(btScalar)1.2;
+	m_dynamicsWorld->getWorldInfo().water_density		=	0;
+	m_dynamicsWorld->getWorldInfo().water_offset		=	0;
+	m_dynamicsWorld->getWorldInfo().water_normal		=	btVector3(0,0,0);
+	m_dynamicsWorld->getWorldInfo().m_gravity.setValue(0,-10,0);
+
+	// Disabled: would add the ground box to the world as a zero-mass rigid body.
+#if 0
+	{
+		btScalar mass(0.);
+
+		//rigidbody is dynamic if and only if mass is non zero, otherwise static
+		bool isDynamic = (mass != 0.f);
+
+		btVector3 localInertia(0,0,0);
+		if (isDynamic)
+			groundShape->calculateLocalInertia(mass,localInertia);
+
+		//using motionstate is recommended, it provides interpolation capabilities, and only synchronizes 'active' objects
+		btDefaultMotionState* myMotionState = new btDefaultMotionState(groundTransform);
+		btRigidBody::btRigidBodyConstructionInfo rbInfo(mass,myMotionState,groundShape,localInertia);
+		btRigidBody* body = new btRigidBody(rbInfo);
+
+		//add the body to the dynamics world
+		m_dynamicsWorld->addRigidBody(body);
+	}
+ 
+#endif
+
+#ifdef USE_GPU_SOLVER
+	createFlag( *g_openCLSolver, clothWidth, clothHeight, m_flags );
+#else
+	createFlag( *g_cpuSolver, clothWidth, clothHeight, m_flags );
+#endif
+
+	// Create output buffer descriptions for each flag
+	// These describe where the simulation should send output data to
+	for( int flagIndex = 0; flagIndex < m_flags.size(); ++flagIndex )
+	{		
+//		m_flags[flagIndex]->setWindVelocity( Vectormath::Aos::Vector3( 0.f, 0.f, 15.f ) );
+
+		// The descriptor wraps this cloth's cpu_buffer viewed as floats. Per the original
+		// note, (0, 8, 3, 8) means a vertex at float index 0, 8, 16, ... and a normal at
+		// 3, 11, 19, ... (the old text spoke of a DX11 buffer and GPU-side copies).
+		btCPUVertexBufferDescriptor *vertexBufferDescriptor = new btCPUVertexBufferDescriptor(reinterpret_cast< float* >(cloths[flagIndex].cpu_buffer), 0, 8, 3, 8);
+		cloths[flagIndex].m_vertexBufferDescriptor = vertexBufferDescriptor;
+	}
+
+	// Hand every soft body currently in the world to the solver's optimize() step.
+	g_solver->optimize( m_dynamicsWorld->getSoftBodyArray() );
+
+}
+
+
+
+
+btClock m_clock;
+
+// Per-frame driver: advances the simulation by the wall-clock time elapsed since
+// the previous call, then copies every flag into its vertex buffer and draws it.
+void doFlags()
+{
+	const btScalar elapsedMicros = (btScalar)m_clock.getTimeMicroseconds();
+	m_clock.reset();
+
+	if( m_dynamicsWorld )
+	{
+		m_dynamicsWorld->stepSimulation(elapsedMicros/1000000.);
+
+		// On the 100th frame only: run one extra fixed 1/60 step and dump the profile.
+		static int frameCount = 0;
+		if( ++frameCount == 100 )
+		{
+			m_dynamicsWorld->stepSimulation(1./60.,0);
+			CProfileManager::dumpAll();
+		}
+		updatePhysicsWorld();
+	}
+
+	for( int flagIndex = 0; flagIndex < m_flags.size(); ++flagIndex )
+	{
+		g_solver->copySoftBodyToVertexBuffer( m_flags[flagIndex], cloths[flagIndex].m_vertexBufferDescriptor );
+		cloths[flagIndex].draw();
+	}
+}
+
+
+// Demo entry point. Order of set-up: OpenCL, the CPU-side cloth buffers, the
+// physics world (plus one initial 1/60 step), the GL window, and only then the
+// flag textures; finally control is handed to goGL().
+int main(int argc, char *argv[])
+{
+	initCL();
+
+	cloths.resize(numFlags);
+	for( int clothIndex = 0; clothIndex < numFlags; ++clothIndex )
+		cloths[clothIndex].create_buffers(clothWidth, clothHeight);
+
+	initBullet();
+	m_dynamicsWorld->stepSimulation(1./60.,0);
+
+	preInitGL(argc, argv);
+
+	// Flags cycle through the available bitmaps in order.
+	std::string flagTexs[] = {
+		"atiFlag.bmp",
+		"amdFlag.bmp",
+	};
+	int numFlagTexs = 2;
+
+	for( int clothIndex = 0; clothIndex < numFlags; ++clothIndex )
+	{
+		const std::string& textureFile = flagTexs[clothIndex % numFlagTexs];
+		cloths[clothIndex].create_texture(textureFile);
+		cloths[clothIndex].x_offset = 0;
+		cloths[clothIndex].y_offset = 0;
+		cloths[clothIndex].z_offset = 0;
+	}
+
+	goGL();
+	return 0;
+}
+
diff --git a/Demos/OpenCLClothDemo/cloth.h b/Demos/OpenCLClothDemo/cloth.h
new file mode 100644
index 000000000..cce971b22
--- /dev/null
+++ b/Demos/OpenCLClothDemo/cloth.h
@@ -0,0 +1,183 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2008 Advanced Micro Devices
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "gl_win.h" //for OpenGL stuff
+
+#include "bmpLoader.h"
+#include <string>
+#include "LinearMath/btScalar.h"
+
+
+struct vertex_struct // one interleaved render vertex: 8 consecutive floats
+{
+	float pos[3];      // position
+	float normal[3];   // normal
+	float texcoord[2]; // texture coordinate
+
+};
+
+class btVertexBufferDescriptor;
+
+// CPU-side render mesh for one cloth: interleaved vertices, a triangle index
+// list and a GL texture. There is no destructor (copying an instance copies the
+// raw pointers), so buffers are only released by an explicit destroy() call.
+class piece_of_cloth 
+{
+	public:
+
+	// Frees the arrays allocated by create_buffers() and returns to the empty state.
+	void destroy(void)
+	{
+		if(created)
+		{
+			delete [] cpu_buffer;
+			delete [] indices; // the index array was never released before
+			cpu_buffer = NULL;
+			indices = NULL;
+			created = false;
+		}
+	}
+
+	// Starts empty; every member gets a defined value so draw()/destroy() are safe
+	// to call before create_buffers().
+	piece_of_cloth()
+	{
+		created = false;
+		cpu_buffer = NULL;
+		indices = NULL;
+		m_vertexBufferDescriptor = NULL;
+		x_offset = y_offset = z_offset = 0;
+		width = height = 0;
+		texture = 0;
+	}
+
+	bool created;                    // true once create_buffers() has run
+
+	vertex_struct* cpu_buffer;       // width*height interleaved vertices
+	unsigned int* indices;           // triangle list, 6 indices per grid quad
+	btVertexBufferDescriptor *m_vertexBufferDescriptor;
+
+	double x_offset, y_offset, z_offset;
+
+	int width;                       // vertices per row
+	int height;                      // number of rows
+
+	GLuint texture;
+
+	// Draws the grid as textured triangles directly from the CPU arrays (normals
+	// are not submitted). Does nothing until create_buffers() has run.
+	void draw(void)
+	{
+		if(!created)
+			return;
+		glEnable(GL_TEXTURE_2D);
+		glBindTexture (GL_TEXTURE_2D, texture);
+		glEnable(GL_DEPTH_TEST);
+		glColor3f(0.0f, 1.0f, 1.0f);
+		glEnableClientState(GL_VERTEX_ARRAY);
+		glEnableClientState(GL_TEXTURE_COORD_ARRAY);
+		glVertexPointer( 3, GL_FLOAT, sizeof(vertex_struct), reinterpret_cast< GLvoid* >(&(cpu_buffer[0].pos[0])) );
+		glTexCoordPointer( 2, GL_FLOAT, sizeof(vertex_struct), reinterpret_cast< GLvoid* >(&(cpu_buffer[0].texcoord[0])) );
+		glDrawElements(GL_TRIANGLES, (height-1  )*(width-1)*3*2, GL_UNSIGNED_INT, indices);
+		glDisableClientState(GL_VERTEX_ARRAY);
+		glDisableClientState(GL_TEXTURE_COORD_ARRAY);
+		glBindTexture(GL_TEXTURE_2D, 0);
+	}
+
+	// Loads the bitmap `filename` into a new GL texture stored in `texture`.
+	// Prints an error naming the file and exits the process if it cannot be loaded.
+	void create_texture(std::string filename)
+	{
+		amd::BitMap texBMP(filename.c_str());
+		if ( !texBMP.isLoaded() ) {
+			// Report the file that actually failed (was a hard-coded "texture.bmp").
+			std::cout << "ERROR: could not load bitmap " << filename << std::endl;
+			exit(1);
+		}
+
+		glEnable(GL_TEXTURE_2D);
+		glGenTextures(1, &texture);
+		glBindTexture(GL_TEXTURE_2D, texture);
+		glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+		glTexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_DECAL);
+		glTexImage2D(
+			GL_TEXTURE_2D,
+			0,
+			GL_RGBA8,
+			texBMP.getWidth(),
+			texBMP.getHeight(),
+			0,
+			GL_RGBA,
+			GL_UNSIGNED_BYTE,
+			texBMP.getPixels());
+		glBindTexture(GL_TEXTURE_2D, 0);
+	}
+
+	// Allocates and fills the vertex and index arrays for a width_ x height_ grid.
+	// Any buffers from an earlier call are released first.
+	void create_buffers(int width_, int height_)
+	{
+		destroy();
+		width = width_;
+		height = height_;
+		created = true;
+
+		cpu_buffer = new vertex_struct[width*height];
+		memset(cpu_buffer, 0, width*height*sizeof(vertex_struct));
+
+		// Initial test data for rendering: a unit square in XZ with a slight ripple in Y.
+		for(int y = 0; y < height; y++)
+		{
+			for(int x = 0; x < width; x++)
+			{
+				vertex_struct& v = cpu_buffer[y*width+x];
+				double coord = btSin(x/5.0)*0.01;
+
+				v.pos[0]      = (x/((float)(width-1)))*1;
+				v.pos[1]      = coord;
+				v.pos[2]      = (y/((float)(height-1)))*1;
+				v.normal[0]   = 1;
+				v.normal[1]   = 0;
+				v.normal[2]   = 0;
+				v.texcoord[0] = x/((float)(width-1));
+				v.texcoord[1] = y/((float)(height-1));
+			}
+		}
+
+		// Index array for rendering. This allocates more than the
+		// (width-1)*(height-1)*6 entries that are written below and drawn.
+		indices = new unsigned int[width*3*2+2 + height*width*3*2];
+
+		for(int y = 0; y < height-1; y++)
+		{
+			for(int x = 0; x < width-1; x++)
+			{
+				// *3 indices/triangle, *2 triangles/quad
+				int baseIndex = (x + y*(width-1))*3*2;
+				indices[baseIndex] = x + y*width;
+				indices[baseIndex+1] = x+1 + y*width;
+				indices[baseIndex+2] = x+width + y*width;
+				indices[baseIndex+3] = x + 1 +  y*width;
+				indices[baseIndex+4] = x+(width+1) + y*width;
+				indices[baseIndex+5] = x+width + y*width;
+			}
+		}
+	}
+};
diff --git a/Demos/OpenCLClothDemo/clstuff.cpp b/Demos/OpenCLClothDemo/clstuff.cpp
new file mode 100644
index 000000000..0ee8b1ad4
--- /dev/null
+++ b/Demos/OpenCLClothDemo/clstuff.cpp
@@ -0,0 +1,53 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2008 Advanced Micro Devices
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#include "clstuff.h"
+#include "gl_win.h"
+
+
+#include "btOclCommon.h"
+#include "btOclUtils.h"
+#include "LinearMath/btScalar.h"
+
+cl_context			g_cxMainContext;
+cl_device_id		g_cdDevice;
+cl_command_queue	g_cqCommandQue;
+
+// Creates the global OpenCL context (all device types), selects the device
+// returned by btOclGetMaxFlopsDev, prints its info and opens a command queue.
+// Results are stored in g_cxMainContext, g_cdDevice and g_cqCommandQue; both
+// fallible calls are checked with oclCHECKERROR against CL_SUCCESS.
+void initCL(void)
+{
+	int ciErrNum = 0;
+
+	// Alternatives tried during development (left disabled in the original):
+	//   CL_DEVICE_TYPE_GPU or CL_DEVICE_TYPE_CPU instead of CL_DEVICE_TYPE_ALL;
+	//   CL_DEVICE_TYPE_DEBUG under USE_MINICL for sequential, non-threaded
+	//   execution, which gives a full callstack at a crash in the kernel.
+	g_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	// Pick the device and report it.
+	g_cdDevice = btOclGetMaxFlopsDev(g_cxMainContext);
+	btOclPrintDevInfo(g_cdDevice);
+
+	// create a command-queue
+	g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, g_cdDevice, 0, &ciErrNum);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+}
diff --git a/Demos/OpenCLClothDemo/clstuff.h b/Demos/OpenCLClothDemo/clstuff.h
new file mode 100644
index 000000000..09f6313eb
--- /dev/null
+++ b/Demos/OpenCLClothDemo/clstuff.h
@@ -0,0 +1,10 @@
+#ifndef __CLSTUFF_HDR__
+#define __CLSTUFF_HDR__
+
+
+
+
+
+void initCL(void);
+
+#endif //__CLSTUFF_HDR__
\ No newline at end of file
diff --git a/Demos/OpenCLClothDemo/clstuff.hpp b/Demos/OpenCLClothDemo/clstuff.hpp
new file mode 100644
index 000000000..09f6313eb
--- /dev/null
+++ b/Demos/OpenCLClothDemo/clstuff.hpp
@@ -0,0 +1,10 @@
+#ifndef __CLSTUFF_HDR__
+#define __CLSTUFF_HDR__
+
+
+
+
+
+void initCL(void);
+
+#endif //__CLSTUFF_HDR__
\ No newline at end of file
diff --git a/Demos/OpenCLClothDemo/fragment.glsl b/Demos/OpenCLClothDemo/fragment.glsl
new file mode 100644
index 000000000..6a265d348
--- /dev/null
+++ b/Demos/OpenCLClothDemo/fragment.glsl
@@ -0,0 +1,7 @@
+uniform sampler2D tex;
+	
+void main()
+{
+	vec4 color   = texture2D(tex,gl_TexCoord[0].st);
+	gl_FragColor = color;
+}
\ No newline at end of file
diff --git a/Demos/OpenCLClothDemo/gl_win.cpp b/Demos/OpenCLClothDemo/gl_win.cpp
new file mode 100644
index 000000000..1826330a3
--- /dev/null
+++ b/Demos/OpenCLClothDemo/gl_win.cpp
@@ -0,0 +1,272 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2008 Advanced Micro Devices
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "clstuff.h"
+#include "gl_win.h"
+
+#include <cstdlib>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <math.h>
+#include <cmath>
+#include <cstring>
+
+
+//#ifndef _WIN32 && !defined(__APPLE__)
+//#include <GL/glx.h>
+//#endif //!_WIN32
+
+
+
+static GLuint vbo = 0;
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+
+static unsigned int windowWidth  = 1280;
+static unsigned int windowHeight = 1024;
+
+// mouse controls
+int mouseOldX;
+int mouseOldY;
+int mouseButtons         = 0;
+
+float rotateX;
+float rotateY;
+
+float translateZ;
+float translateX;
+float translateY;
+
+static GLuint glProgram;
+
+
+void doFlags();
+
+
+// Renders one frame: clears colour and depth, rebuilds the model-view matrix
+// from the global camera translation/rotation, then calls doFlags().
+void render( void)
+{
+	glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );
+//	glDisable ( GL_CULL_FACE );
+
+	glMatrixMode( GL_MODELVIEW );
+	glLoadIdentity();
+
+	// Camera: translate, then rotate by rotateX about X and rotateY about Y.
+	glTranslatef( translateX, translateY, translateZ );
+	glRotatef( rotateX, 0.5f , 0.0f, 0.0f );
+	glRotatef( rotateY, 0.0f, 0.5f, 0.0f );
+
+//	glDisable (GL_BLEND);
+
+	doFlags();
+
+	// TODO (disabled sketch of drawing from a vertex buffer object instead):
+	//glBindBuffer(GL_ARRAY_BUFFER, vbo);
+	//glVertexPointer(4, GL_FLOAT, 0, NULL);
+	//glEnableClientState(GL_VERTEX_ARRAY);
+	//glDrawArrays(GL_POINTS, 0, 4*4);
+	//glDisableClientState(GL_VERTEX_ARRAY);
+	//glBindBuffer(GL_ARRAY_BUFFER, 0);
+	//glUseProgram(0);
+}
+
+// Sets the clear colour and the initial camera rotation/translation globals.
+// Both lighting/material set-ups below are compiled out with #if 0.
+static void initGL(void) 
+{
+	//glClearColor( 0.05f, 0.0f, 0.1f, 0.1f );
+	glClearColor(  0.0f, 0.45f, 0.45f, 1.f);
+
+#if 0
+	GLfloat mat_specular[] = { 1.0f, 1.0f, 1.0f, 1.0f };
+	GLfloat mat_shininess[] = { 50.0f };
+	GLfloat light_position[] = { 
+	   -10.f, 
+	   5.f, 
+	   -1.f, 
+	   1.0f };
+
+	glEnable ( GL_COLOR_MATERIAL );
+	glShadeModel( GL_SMOOTH );
+	glEnable( GL_LINE_SMOOTH );
+
+	glMaterialfv( GL_FRONT, GL_SPECULAR, mat_specular );
+	glMaterialfv( GL_FRONT, GL_SHININESS, mat_shininess );
+	glLightfv( GL_LIGHT0, GL_POSITION, light_position );
+
+	//glEnable( GL_LIGHTING );
+	//glEnable( GL_LIGHT0 ); // Switch on and crashes!
+	glEnable( GL_DEPTH_TEST );
+#endif 
+#if 0
+   // NOTE(review): this variant uses the arrays declared in the disabled block above.
+   glEnable ( GL_COLOR_MATERIAL );
+   glShadeModel( GL_SMOOTH );
+   glEnable( GL_LINE_SMOOTH );
+
+   glMaterialfv( GL_FRONT, GL_SPECULAR, mat_specular );
+   glMaterialfv( GL_FRONT, GL_SHININESS, mat_shininess );
+   glLightfv( GL_LIGHT0, GL_POSITION, light_position );
+
+   glEnable( GL_LIGHTING );
+   glEnable( GL_LIGHT0 );
+   glEnable( GL_DEPTH_TEST );
+#endif
+   rotateX    = 0;
+   rotateY    = 30;
+   translateX = 0.0f;
+   translateY = -30.0f;
+   translateZ = -120.0;
+}
+
+void display(void)
+{
+	// Draw the frame, present it, and immediately request the next redraw.
+	render();
+	glutSwapBuffers();
+	glutPostRedisplay();
+}
+
+// GLUT keyboard callback: 'q' or Escape quits; a/z, d/s and f/g nudge the camera
+// translation by 0.1 along Y, X and Z respectively.
+void keyboard( unsigned char key, int /*x*/, int /*y*/)
+{
+  switch( key) {
+  case('q') :
+  case 27: // ASCII ESC as delivered by GLUT on every platform (was VK_ESCAPE, _WIN32 only)
+    exit(0);
+  break;
+  case('a'):
+    translateY += 0.1f;
+    break;
+  case('z'):
+    translateY -= 0.1f;
+    break;
+  case('d'):
+    translateX += 0.1f;
+    break;
+  case('s'):
+    translateX -= 0.1f;
+    break;
+  case('f'):
+    translateZ += 0.1f;
+    break;
+  case('g'):
+    translateZ -= 0.1f;
+    break;
+  }
+}
+
+// GLUT mouse callback: tracks held buttons as a bitmask (bit n = GLUT button n).
+void mouse(int button, int state, int x, int y)
+{
+  if (state == GLUT_DOWN) {
+    mouseButtons |= 1<<button;
+  } else if (state == GLUT_UP) {
+    mouseButtons &= ~(1<<button); // clear only the released button, not all of them
+  }
+  mouseOldX = x;
+  mouseOldY = y;
+  glutPostRedisplay();
+}
+
+// GLUT drag callback: left = rotate, right = move along Z, left+right = pan in XY.
+void motion(int x, int y)
+{
+  float dx = x - mouseOldX;
+  float dy = y - mouseOldY;
+  // The two-button chord is tested first and exactly: as an "& 5" test placed
+  // after "& 1" it shadowed the "& 4" branch, which could never run.
+  if ((mouseButtons & 5) == 5) {
+    translateY -= dy * 0.01;
+    translateX -= dx * 0.01;
+  }
+  else if (mouseButtons & 1) {
+    rotateX += dy * 0.2;
+    rotateY += dx * 0.2;
+  }
+  else if (mouseButtons & 4) {
+    translateZ += dy * 0.01;
+  }
+  mouseOldX = x;
+  mouseOldY = y;
+}
+
+
+// GLUT reshape callback: records the new window size, then resets the viewport and
+// the perspective projection (60 degree field of view, near 0.1, far 600).
+void reshape (int w, int h)
+{
+	windowWidth  = w;
+	windowHeight = (h > 0) ? h : 1; // a zero height would make the aspect ratio below infinite/NaN
+	glViewport(0, 0, windowWidth, windowHeight);
+
+	glMatrixMode(GL_PROJECTION);
+	glLoadIdentity();
+	gluPerspective(
+		60.0, (GLfloat)windowWidth / (GLfloat) windowHeight,
+		0.1, 600.0f );
+}
+
+void goGL(void)
+{
+	glutMainLoop(); // hand control to GLUT's event loop
+}
+
+// Initialises GLUT, creates the double-buffered RGBA + depth window, applies the
+// initial GL/camera state and registers the display/input callbacks.
+void preInitGL(int argc, char ** argv)
+{
+	glutInit( &argc, argv );
+	glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH );
+	glutInitWindowSize( windowWidth, windowHeight );    
+	glutCreateWindow ("OpenCL Renderer");
+
+	initGL();
+	glViewport( 0, 0, windowWidth, windowHeight);
+	// reshape() sets the viewport again and builds the projection matrix.
+	reshape( windowWidth, windowHeight );
+
+	glutDisplayFunc(display); 
+	glutReshapeFunc(reshape);
+	glutKeyboardFunc(keyboard);
+	glutMouseFunc(mouse);
+	glutMotionFunc(motion);
+}
+
+/*
+int getVBO( std::string, int s)
+{
+	GLuint size = (GLuint)s;
+	if (vbo == 0) {
+		// Create VBO
+		// create buffer object
+		glGenBuffers(1, &vbo);
+		glBindBuffer(GL_ARRAY_BUFFER, vbo);
+		glBufferData(GL_ARRAY_BUFFER, size, 0, GL_STATIC_DRAW);
+		glBindBuffer(GL_ARRAY_BUFFER, 0);
+	}
+
+	return vbo;
+}
+*/
diff --git a/Demos/OpenCLClothDemo/gl_win.h b/Demos/OpenCLClothDemo/gl_win.h
new file mode 100644
index 000000000..db5439f07
--- /dev/null
+++ b/Demos/OpenCLClothDemo/gl_win.h
@@ -0,0 +1,49 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2008 Advanced Micro Devices
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef __GL_WIN_HDR__
+#define __GL_WIN_HDR__
+
+#ifdef _WIN32//for glut.h
+#include <windows.h>
+#endif
+
+//think different
+#if defined(__APPLE__) && !defined (VMDMESA)
+#include <OpenGL/OpenGL.h>
+#include <OpenGL/gl.h>
+#include <OpenGL/glu.h>
+#include <GLUT/glut.h>
+#else
+
+
+#ifdef _WINDOWS
+#include <windows.h>
+#include <GL/gl.h>
+#include <GL/glu.h>
+#else
+#include <GL/glut.h>
+#endif //_WINDOWS
+#endif //APPLE
+
+
+#include <string>
+
+void goGL(void);
+void preInitGL(int argc, char ** argv);
+
+//int getVBO( std::string, int size );
+
+#endif //__GL_WIN_HDR__
diff --git a/Demos/OpenCLClothDemo/gl_win.hpp b/Demos/OpenCLClothDemo/gl_win.hpp
new file mode 100644
index 000000000..e7d3f9388
--- /dev/null
+++ b/Demos/OpenCLClothDemo/gl_win.hpp
@@ -0,0 +1,34 @@
+#ifndef __GL_WIN_HDR__
+#define __GL_WIN_HDR__
+
+#ifdef _WIN32//for glut.h
+#include <windows.h>
+#endif
+
+//think different
+#if defined(__APPLE__) && !defined (VMDMESA)
+#include <OpenGL/OpenGL.h>
+#include <OpenGL/gl.h>
+#include <OpenGL/glu.h>
+#include <GLUT/glut.h>
+#else
+
+
+#ifdef _WINDOWS
+#include <windows.h>
+#include <GL/gl.h>
+#include <GL/glu.h>
+#else
+#include <GL/glut.h>
+#endif //_WINDOWS
+#endif //APPLE
+
+
+#include <string>
+
+void goGL(void);
+void preInitGL(int argc, char ** argv);
+
+int getVBO( std::string, int size );
+
+#endif //__GL_WIN_HDR__
diff --git a/Demos/OpenCLClothDemo/shaders.cl b/Demos/OpenCLClothDemo/shaders.cl
new file mode 100644
index 000000000..27e2d219f
--- /dev/null
+++ b/Demos/OpenCLClothDemo/shaders.cl
@@ -0,0 +1,535 @@
+#pragma OPENCL EXTENSION cl_amd_printf : enable                        
+
+// float3/uint3 are aliased to the 4-component types throughout this file.
+#define float3 float4
+#define uint3  uint4
+// No trailing ';' here: a macro value must be usable inside an expression.
+#define PARTICLE_RADIUS 0.05
+// Fixed render-target size and the matching bottom/top/left/right bounds.
+#define width 1280
+#define height 1024
+#define B 0 
+#define T height
+#define L 0
+#define R width
+// Fixed-point scale used by the rasterizer (shiftValue 16 = 1 << shiftNumber).
+#define shiftNumber 4
+#define shiftMask 0xF
+#define shiftValue 16.0f
+
+// Derived sizes; compound values are parenthesized so they expand safely. `stride`
+// had two conflicting definitions (4, then screenWidth1); only the later is kept.
+#define screenWidth1 width
+#define screenHeight1 height
+#define halfScreenWidth1 (screenWidth1/2)
+#define halfScreenHeight1 (screenHeight1/2)
+#define screenWidth1SubOne (screenWidth1-1)
+#define screenHeight1SubOne (screenHeight1-1)
+#define stride screenWidth1
+#define screenPixelNumber (screenWidth1*screenHeight1)
+#define depthBufferSize (screenPixelNumber*depthComplexity)
+#define WGS 1
+
+//---------------------------------------------------------------
+
+// Sprite record holding a position and the originating particle position.
+struct __VSSpriteOut
+{
+    float4 position; 
+    float4 particlePosition; 
+};
+// The tag previously read "__VSSpriteout", which names a different, incomplete type.
+typedef struct __VSSpriteOut VSSpriteOut;
+
+// Sprite record holding a position and a texture coordinate.
+struct __GSSpriteOut
+{
+    float4 position;
+    float2 textureUV;
+};
+// Tag case corrected here too (was "__GSSpriteout").
+typedef struct __GSSpriteOut GSSpriteOut;
+
+//------------------------------------------------------------------------------
+
+__constant float4 g_positions[4] = // corners of the [-1,1] x [-1,1] square in the z = 0 plane
+{
+  (float4)(-1.0f, 1.0f, 0.0f, 0.0f),
+  (float4)( 1.0f, 1.0f, 0.0f, 0.0f),
+  (float4)( -1.0f, -1.0f, 0.0f, 0.0f),
+  (float4)( 1.0f, -1.0f, 0.0f, 0.0f)
+};
+// Texture coordinates listed in the same order as the four positions above.
+__constant float2 g_texcoords[4] = 
+{ 
+	(float2)(0.0f,0.0f), 
+    (float2)(1.0f,0.0f),
+    (float2)(0.0f,1.0f),
+    (float2)(1.0f,1.0f)
+};
+
+//------------------------------------------------------------------------------
+
+// Copies the 16 floats of a 4x4 matrix from __constant memory (matrix0) into
+// the caller's array (matrix), element by element.
+void copyMatrix(
+	float matrix[16],
+	__constant float matrix0[16])
+{
+	for (uint element = 0; element < 16; ++element) {
+		matrix[element] = matrix0[element];
+	}
+}
+
+// Computes output element i of matrix = matrix0 * matrix1, where index r + 4*c
+// addresses row r, column c of a 4x4 matrix.
+void matrixMulLoopBody(	
+	uint i,
+	float matrix[16], 
+	__constant float matrix0[16], 
+	__constant float matrix1[16])
+{
+	matrix[i] = 0.0f;
+	for (uint k = 0; k < 4; ++k)
+		matrix[i] += matrix0[(i%4) + (k*4)] * matrix1[k + ((i/4)*4)];
+}
+
+// Multiplies two 4x4 matrices held in __constant memory: matrix = matrix0 * matrix1,
+// where index r + 4*c addresses row r, column c. Every output element is produced
+// by one matrixMulLoopBody call, visited in ascending index order (column by
+// column, four rows per column).
+void matrixMul(
+	float matrix[16], 
+	__constant float matrix0[16], 
+	__constant float matrix1[16])
+{
+	// The output array is separate from the two __constant inputs, so each
+	// element can be written as soon as it is computed.
+
+	for (uint column = 0; column < 4; ++column) {
+		const uint first = column * 4;
+
+		matrixMulLoopBody(first + 0, matrix, matrix0, matrix1);
+		matrixMulLoopBody(first + 1, matrix, matrix0, matrix1);
+		matrixMulLoopBody(first + 2, matrix, matrix0, matrix1);
+		matrixMulLoopBody(first + 3, matrix, matrix0, matrix1);
+	}
+
+}
+
+// Returns matrix * vector for a 4x4 matrix whose element (row r, column c) is
+// stored at index r + 4*c.
+float4 matrixVectorMul(float matrix[16], float4 vector)
+{
+	const float x = vector.x, y = vector.y, z = vector.z, w = vector.w;
+	return (float4)(
+		matrix[0]*x + matrix[4+0]*y + matrix[8+0]*z + matrix[12+0]*w,
+		matrix[1]*x + matrix[4+1]*y + matrix[8+1]*z + matrix[12+1]*w,
+		matrix[2]*x + matrix[4+2]*y + matrix[8+2]*z + matrix[12+2]*w,
+		matrix[3]*x + matrix[4+3]*y + matrix[8+3]*z + matrix[12+3]*w);
+}
+
+// Returns the 3x3 product matrix * vector (element (r,c) stored at r + 3*c). float3 is
+// #defined to float4 in this file, so the result is zero-filled first to give .w a value.
+float3 matrixVector3Mul(__constant float matrix[9], float3 vector)
+{
+	float3 result = (float3)(0.0f);
+	result.x = matrix[0]*vector.x + matrix[3+0]*vector.y + matrix[6+0]*vector.z;
+	result.y = matrix[1]*vector.x + matrix[3+1]*vector.y + matrix[6+1]*vector.z;
+	result.z = matrix[2]*vector.x + matrix[3+2]*vector.y + matrix[6+2]*vector.z;
+	return result;
+}
+
+//------------------------------------------------------------------------------
+
+// Debug helper, compiled only when DEVICE_CPU is defined (uncomment: //#define DEVICE_CPU 1)
+#if defined(DEVICE_CPU)
+void printMatrix(char * name, __constant float matrix[16]) // prints four floats per line, labelled name[0]..name[3]
+{
+	printf("%s[0] = %f, %f, %f, %f\n", name, matrix[0], matrix[1], matrix[2], matrix[3]);	
+	printf("%s[1] = %f, %f, %f, %f\n", name, matrix[4], matrix[5], matrix[6], matrix[7]);	
+	printf("%s[2] = %f, %f, %f, %f\n", name, matrix[8], matrix[9], matrix[10], matrix[11]);	
+	printf("%s[3] = %f, %f, %f, %f\n", name, matrix[12], matrix[13], matrix[14], matrix[15]);	
+}
+#endif
+
+#if 1
+// Kernel: one work-item per vertex. Transforms inputPrimitives[id] by
+// projection * modelview and stores the result in outputPrimitives[id].
+// The combined matrix is recomputed by every work-item.
+__kernel void vertexShader(
+    __constant float modelview[16],
+	__constant float projection[16],
+	__global float4 * inputPrimitives, 
+	__global float4 * outputPrimitives)
+{
+	const uint vertexIndex = get_global_id(0);
+	const float4 vertexIn = inputPrimitives[vertexIndex];
+
+	// gl_ProjectionMatrix * gl_ModelViewMatrix ...
+	float modelViewProjection[16];
+	matrixMul(modelViewProjection, projection, modelview);
+
+	// ... * gl_Vertex
+	outputPrimitives[vertexIndex] = matrixVectorMul(modelViewProjection, vertexIn);
+}
+
+#else
+
+// Pass-through variant (compiled out): copies each input vertex unchanged.
+__kernel void vertexShader(
+    __constant float modelview[16],
+	__constant float projection[16],
+	__global float4 * inputPrimitives, 
+	__global float4 * outputPrimitives)
+{
+	const uint vertexIndex = get_global_id(0);
+
+	outputPrimitives[vertexIndex] = inputPrimitives[vertexIndex];
+
+}
+
+#endif
+
+//-----------------------------------------------------------------------------------
+
+// Kernel: writes `color` to the pixel addressed by this work-item's 2D global id.
+__kernel void
+clearImage(
+	__write_only image2d_t image,
+	float4 color)
+{
+	const int2 pixel = (int2)(get_global_id(0), get_global_id(1));
+	write_imagef(image, pixel, color);
+}
+
+// OpenGL-style viewport transformation from normalized device coordinates to
+// window coordinates; see http://research.cs.queensu.ca/~jstewart/454/notes/pipeline/
+// viewport[0] is read as (min x, min y, max x, max y), matching the L/B/R/T
+// mapping used by rasterizerXX: out = 0.5*(v+1)*(max-min) + min.
+void 
+viewportTransform(float4 v, __constant int4 viewport[1], float2 * output)
+{
+	int4 vp = viewport[0];
+	const float2 origin = (float2)((float)vp.s0, (float)vp.s1);
+	const float2 extent = (float2)((float)(vp.s2-vp.s0), (float)(vp.s3-vp.s1));
+	// The origin used to be added inside the scale factor, cancelling the subtraction.
+	*output = 0.5f * (float2)(v.x+1,v.y+1) * extent + origin;
+}
+
+#define PARTICLE_WIDTH  32.0f
+#define PARTICLE_HEIGHT 32.0f
+
+// Unoptimized half-space triangle rasterizer (algorithm reference:
+// http://www.devmaster.net/forums/showthread.php?t=1884). Each work-item takes
+// three of the four records stored for its id (chosen by v1/v2/v3Offset), maps
+// them through viewportTransform and writes white to every covered pixel.
+void
+rasterizerUnOpt(
+    __global struct __GSSpriteOut * outputPrimitives,
+//	 __global float4 * outputPrimitives,
+	__constant int4  viewport[1],
+	__write_only image2d_t screen,
+	__read_only image2d_t particle,
+	uint v1Offset,
+	uint v2Offset,
+	uint v3Offset,
+	__global float4 * debugOut1)
+{
+	sampler_t sampler = 
+		CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+	// `sampler`, `particle` and `debugOut1` are not used by the current body.
+	uint id = get_global_id(0);
+
+	struct __GSSpriteOut output;
+	float2 v1, v2, v3;
+	float2 uv1, uv2, uv3;
+	// Fetch each vertex, remember its UV and convert its position to window space.
+	output = outputPrimitives[id*4+v1Offset];
+	uv1    = output.textureUV;
+	viewportTransform(output.position, viewport, &v1);
+
+	output = outputPrimitives[id*4+v2Offset];
+	uv2    = output.textureUV;
+	viewportTransform(output.position, viewport, &v2);
+
+	output = outputPrimitives[id*4+v3Offset];
+	uv3    = output.textureUV;
+	viewportTransform(output.position, viewport, &v3);
+
+	// Bounding rectangle
+	int2 min_ = convert_int2(min(v1, min(v2, v3)));
+	int2 max_ = convert_int2(max(v1, max(v2, v3)));
+	// NOTE(review): the rectangle is not clipped to the image size here - confirm callers.
+	// naive bi-linear interpolation for texture coords, note this is 
+	// broken with respect to OpenGL and needs to be fixed for the 
+	// general case.
+	float p1x = v2.x - v1.x;
+	float p1y = v2.y - v1.y;
+	
+	float p2x = v3.x - v1.x;
+	float p2y = v3.y - v1.y;
+
+	// Scan through bounding rectangle
+	for(int y = min_.y; y < max_.y; y++) {
+		for(int x = min_.x; x < max_.x; x++) {
+			// When all half-space functions positive, pixel is in triangle
+			if((v1.x - v2.x) * (y - v1.y) - (v1.y - v2.y) * (x - v1.x) > 0 &&
+			 (v2.x - v3.x) * (y - v2.y) - (v2.y - v3.y) * (x - v2.x) > 0 &&
+			 (v3.x - v1.x) * (y - v3.y) - (v3.y - v1.y) * (x - v3.x) > 0) {
+
+				float px = x - v1.x;
+				float py = y - v1.y;
+				// px/py (like p1*/p2* and uv*) are computed but not used yet.
+					write_imagef(
+						screen, 
+						(int2)(x,y), 
+					//	texel);
+						(float4)(1.0f,1.0f,1.0f,1.0f));
+			}
+		}
+	}
+}
+
+// Optimized rasterizer function
+// Details of the algorithm can be found here:
+//		http://www.devmaster.net/forums/showthread.php?t=1884
+//	
+// Currently has a bug, still a work in progress
+__kernel void
+rasterizerXX(
+	__global float4 * outputPrimitives,	// vertex positions; work-item id reads slots id*4+0 .. id*4+2
+	__write_only image2d_t screen,		// image that receives the covered pixels
+	__global float4 * debugOut1,		// debug output: the three vertices as read
+	__global int2 * debugOut2)		// debug output: fixed-point vertices; slot 0 is later overwritten with loop state
+{
+	uint id = get_global_id(0);
+
+	// Corners of this work-item's triangle (the fourth slot of each group of four is not read).
+
+	float4 v1 = outputPrimitives[id*4+0];
+	float4 v2 = outputPrimitives[id*4+1];
+	float4 v3 = outputPrimitives[id*4+2];
+	// Map y from [-1,1] onto [B,T] and x onto [L,R]; T, B, R and L are defined outside this function.
+	float y1 = 0.5f* (v1.y+1) * (T - B) + B;
+	float y2 = 0.5f* (v2.y+1) * (T - B) + B;
+	float y3 = 0.5f* (v3.y+1) * (T - B) + B;
+
+	float x1 = 0.5f * (v1.x+1) * (R - L) + L;
+	float x2 = 0.5f * (v2.x+1) * (R - L) + L;
+	float x3 = 0.5f * (v3.x+1) * (R - L) + L;
+	// Fixed-point coordinates: scaled by shiftValue (defined elsewhere; presumably 1 << shiftNumber -- confirm).
+	const int Y1 = convert_int(shiftValue * y1);
+	const int Y2 = convert_int(shiftValue * y2);
+	const int Y3 = convert_int(shiftValue * y3);
+
+	const int X1 = convert_int(shiftValue * x1);
+	const int X2 = convert_int(shiftValue * x2);
+	const int X3 = convert_int(shiftValue * x3);
+
+	debugOut1[id*4+0]   = v1;
+	debugOut1[id*4+1]   = v2;
+	debugOut1[id*4+2]   = v3;
+
+	debugOut2[id*3+0] = (int2)(X1, Y1);
+	debugOut2[id*3+1] = (int2)(X2, Y2);
+	debugOut2[id*3+2] = (int2)(X3, Y3);
+
+	// Deltas
+	const int DX12 = X1 - X2;
+	const int DX23 = X2 - X3;
+	const int DX31 = X3 - X1;
+
+	const int DY12 = Y1 - Y2;
+	const int DY23 = Y2 - Y3;
+	const int DY31 = Y3 - Y1;
+
+	// Fixed-point deltas
+	const int FDX12 = DX12 << shiftNumber;
+	const int FDX23 = DX23 << shiftNumber;
+	const int FDX31 = DX31 << shiftNumber;
+
+	const int FDY12 = DY12 << shiftNumber;
+	const int FDY23 = DY23 << shiftNumber;
+	const int FDY31 = DY31 << shiftNumber;
+
+	// Bounding rectangle in whole pixels (presumably rounded up, if shiftMask == (1 << shiftNumber) - 1).
+	int minx = (min(X1, min(X2, X3)) + shiftMask) >> shiftNumber;
+	//minx = max(0,minx);
+
+	int maxx = (max(X1, max(X2, X3)) + shiftMask) >> shiftNumber;	// largest X of all three vertices
+	//min(maxx , screenWidth1SubOne);
+
+	int miny = (min(Y1, min(Y2, Y3)) + shiftMask) >> shiftNumber;
+	//max(0,miny);
+
+	int maxy = (max(Y1, max(Y2, Y3)) + shiftMask) >> shiftNumber;	// largest Y of all three vertices
+	//min(maxy , screenHeight1SubOne);
+	// NOTE(review): the clamps above are disabled, so the rectangle is not limited to the image.
+	//(char*&)colorBuffer += miny * stride;
+	int offset = miny * stride;	// tracks the row offset only; it is never used for addressing below
+
+	// Half-edge constants
+	int C1 = DY12 * X1 - DX12 * Y1;
+	int C2 = DY23 * X2 - DX23 * Y2;
+	int C3 = DY31 * X3 - DX31 * Y3;
+
+	// Correct for fill convention
+	if(DY12 < 0 || (DY12 == 0 && DX12 > 0)) C1++;
+	if(DY23 < 0 || (DY23 == 0 && DX23 > 0)) C2++;
+	if(DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++;
+	// Edge-function values at the top-left corner (minx, miny) of the rectangle.
+	int CY1 = C1 + DX12 * (miny << shiftNumber) - DY12 * (minx << shiftNumber);
+	int CY2 = C2 + DX23 * (miny << shiftNumber) - DY23 * (minx << shiftNumber);
+	int CY3 = C3 + DX31 * (miny << shiftNumber) - DY31 * (minx << shiftNumber);
+
+	for(int y = miny; y < maxy; y++) {
+		int CX1 = CY1;
+		int CX2 = CY2;
+		int CX3 = CY3;
+
+		debugOut2[id*3+0] = (int2)(minx, maxx);
+
+		for(int x = minx; x < maxx; x++) {
+			debugOut2[id*3+0] = (int2)(CX1, CX2);
+			// Inside the triangle when all three edge functions are positive.
+			if(CX1 > 0 && CX2 > 0 && CX3 > 0) {
+				debugOut2[id*3+0] = (int2)(1, 1);
+				write_imagef(
+					screen,
+					(int2)(x,y),
+					(float4)(1.0f,1.0f,1.0f,1.0f));
+			}
+			// Step the edge functions one pixel in x.
+			CX1 -= FDY12;
+			CX2 -= FDY23;
+			CX3 -= FDY31;
+		}
+		// Step the row start values one pixel in y.
+		CY1 += FDX12;
+		CY2 += FDX23;
+		CY3 += FDX31;
+
+		//(char*&)colorBuffer += stride;
+		offset += stride;
+	}
+}
+
+//------------------------------------------------------------------------------
+// Expands this work-item's staged vertex into four quad corners, stores them, and rasterizes them as two triangles.
+void geometryShader(
+    __constant float modelview[16],	// combined with projection through matrixMul
+	__constant float projection[16],	// combined with modelview through matrixMul
+	__constant float inverseView[9],	// applied to each corner offset through matrixVector3Mul
+	__constant int4  viewport[1],	// forwarded to rasterizerUnOpt
+	__local struct __VSSpriteOut  * vsOutputPrimitives,	// entry [local id] supplies the position to expand
+	__global struct __GSSpriteOut * outputPrimitives,	// receives four entries at id*4 + 0..3
+//	 __global float4 * outputPrimitives,
+	__write_only image2d_t screen,	// forwarded to rasterizerUnOpt
+	__read_only image2d_t particle,	// forwarded to rasterizerUnOpt
+	__global float4 * debugOut1,	// forwarded to rasterizerUnOpt
+	__global int * debugOut2)	// not used in this function
+{
+	float2 texcoords[4] = 	// texture coordinate assigned to each of the four corners
+	{ 
+		(float2)(0.0f,0.0f), 
+		(float2)(1.0f,0.0f),
+		(float2)(0.0f,1.0f),
+		(float2)(1.0f,1.0f)
+	};
+
+	float matrix[16];
+
+	uint id  = get_global_id(0);
+	uint lid = get_local_id(0);
+	// Position staged in local memory for this work-item.
+	float4 vsPosition = vsOutputPrimitives[lid].position;
+	// Presumably matrix = projection * modelview; matrixMul is defined outside this view -- confirm operand order.
+	matrixMul(matrix, projection, modelview);
+	//
+	// Build the four corners of the quad; they are drawn below as triangles (0,1,2) and (2,1,3).
+	//
+	for (uint i = 0; i<4; i++) {
+		float3 position = g_positions[i] * PARTICLE_RADIUS;
+		position        = matrixVector3Mul(inverseView, position) + vsPosition;
+		float3 particlePosition = 
+			matrixVector3Mul( 
+				inverseView, 
+				(float4)(0.0f,0.0f,0.0f,0.0f)) + vsPosition;	// world space
+		// NOTE(review): particlePosition is not read anywhere in this function.
+		// Compute view space position
+		position.w               = 1.0f;
+		position                 = matrixVectorMul(matrix, position);
+		// NOTE(review): position is declared float3, yet .w is used and it is added to the float4 vsPosition -- confirm the intended type.
+		//perspective division
+		position /= position.w;
+
+		struct __GSSpriteOut output;
+		output.position  = position;
+		//output.textureUV = g_texcoords[i];
+		output.textureUV = texcoords[i];
+		outputPrimitives[id*4+i] = output; 
+	}	
+
+	// Render QUAD - Triangle 1
+	rasterizerUnOpt(
+		outputPrimitives,
+		viewport,
+		screen,
+		particle,
+		0,
+		1,
+		2,
+		debugOut1);
+
+	// Render QUAD - Triangle 2
+	rasterizerUnOpt(
+		outputPrimitives,
+		viewport,
+		screen,
+		particle,
+		2,
+		1,
+		3,
+		debugOut1);
+}
+// Kernel entry point, one work-item per input vertex.  Stages the vertex and its
+// transformed position in the work-item's local slot, then calls geometryShader().
+__kernel void vertexShaderSprite(
+	__constant float modelview[16],
+	__constant float projection[16],
+	__constant float inverseView[9],
+	__constant int4  viewport[1],
+	__local struct __VSSpriteOut  * vsOutputPrimitives,
+	__global float4               * inputPrimitives,
+	__global struct __GSSpriteOut * outputPrimitives,
+	__write_only image2d_t screen,
+	__read_only image2d_t particle,
+	__global float4 * debugOut1,
+	__global int * debugOut2)
+{
+	// The global id selects the input vertex, the local id the staging slot.
+	uint vertexIndex = get_global_id(0);
+	uint slot        = get_local_id(0);
+
+	// gl_ProjectionMatrix * gl_ModelViewMatrix
+	float mvp[16];
+	matrixMul(mvp, projection, modelview);
+
+	// Store the raw position together with its transformed counterpart (mvp * gl_Vertex).
+	float4 vertex = inputPrimitives[vertexIndex];
+	vsOutputPrimitives[slot].position         = vertex;
+	vsOutputPrimitives[slot].particlePosition = matrixVectorMul(mvp, vertex);
+
+	geometryShader(
+		modelview,
+		projection,
+		inverseView,
+		viewport,
+		vsOutputPrimitives,
+		outputPrimitives,
+		screen,
+		particle,
+		debugOut1,
+		debugOut2);
+}
\ No newline at end of file
diff --git a/Demos/OpenCLClothDemo/texture1.bmp b/Demos/OpenCLClothDemo/texture1.bmp
new file mode 100644
index 000000000..1d3da81c1
Binary files /dev/null and b/Demos/OpenCLClothDemo/texture1.bmp differ
diff --git a/Demos/OpenCLClothDemo/vertex.glsl b/Demos/OpenCLClothDemo/vertex.glsl
new file mode 100644
index 000000000..516983023
--- /dev/null
+++ b/Demos/OpenCLClothDemo/vertex.glsl
@@ -0,0 +1,7 @@
+void main()
+{
+	// Pass-through stage: the vertex is emitted as supplied; the usual
+	// gl_ProjectionMatrix * gl_ModelViewMatrix * gl_Vertex product is left disabled.
+	gl_Position    = gl_Vertex;
+	gl_TexCoord[0] = gl_MultiTexCoord0;
+}
\ No newline at end of file
diff --git a/Demos/ParticlesOpenCL/AMD/CMakeLists.txt b/Demos/ParticlesOpenCL/AMD/CMakeLists.txt
index 727006017..f4c4422b6 100644
--- a/Demos/ParticlesOpenCL/AMD/CMakeLists.txt
+++ b/Demos/ParticlesOpenCL/AMD/CMakeLists.txt
@@ -7,6 +7,8 @@ ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL
 )
 
 ADD_DEFINITIONS(-DUSE_AMD_OPENCL)
+ADD_DEFINITIONS(-DCL_PLATFORM_AMD)
+
 
 IF (INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 	INCLUDE_DIRECTORIES(		$ENV{==ATISTREAMSDKROOT=}/include )
@@ -53,15 +55,17 @@ IF (USE_GLUT)
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btParticlesSharedDefs.h
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btParticlesSharedTypes.h
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesDemo.h
-					${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.h
+					
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/shaders.h	
+					${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.h
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.h
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.cpp
+					${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.cpp
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/main.cpp
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesDemo.cpp
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/shaders.cpp
-					${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.cpp
+					
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesOCL.cl
 	)
 ELSE (USE_GLUT)
diff --git a/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp b/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp
index 4da7f523c..f35e44d37 100644
--- a/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp
+++ b/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp
@@ -329,7 +329,9 @@ void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv)
 	if (!m_cxMainContext)
 	{
 //		m_cxMainContext = clCreateContextFromType(0, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErrNum);
-		m_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum);
+
+		m_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum);
+		//m_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum);
 		oclCHECKERROR(ciErrNum, CL_SUCCESS);
 		m_cdDevice = btOclGetMaxFlopsDev(m_cxMainContext);
 		
diff --git a/Demos/SharedOpenCL/btOclCommon.cpp b/Demos/SharedOpenCL/btOclCommon.cpp
index d412ef3c6..48fe105d7 100644
--- a/Demos/SharedOpenCL/btOclCommon.cpp
+++ b/Demos/SharedOpenCL/btOclCommon.cpp
@@ -85,7 +85,7 @@ cl_context btOclCommon::createContextFromType(cl_device_type deviceType, cl_int*
 	/* Use NULL for backward compatibility */    
 	cl_context_properties* cprops = (NULL == platform) ? NULL : cps;
     cl_context retContext = clCreateContextFromType(cprops, 
-													CL_DEVICE_TYPE_ALL,                  
+													deviceType,                  
 													NULL,                  
 													NULL,                  
 													&ciErrNum);
diff --git a/Demos/SharedOpenCL/btOclUtils.cpp b/Demos/SharedOpenCL/btOclUtils.cpp
index 7af73b92a..6e0823227 100644
--- a/Demos/SharedOpenCL/btOclUtils.cpp
+++ b/Demos/SharedOpenCL/btOclUtils.cpp
@@ -1,3 +1,18 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2010 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
diff --git a/Demos/SharedOpenCL/btOclUtils.h b/Demos/SharedOpenCL/btOclUtils.h
index fba65d8c6..309deca50 100644
--- a/Demos/SharedOpenCL/btOclUtils.h
+++ b/Demos/SharedOpenCL/btOclUtils.h
@@ -1,3 +1,17 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2010 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
 
 #ifndef BT_OCL_UTILS_H
 #define BT_OCL_UTILS_H
diff --git a/Demos/VectorAdd_OpenCL/VectorAddKernels.cl b/Demos/VectorAdd_OpenCL/VectorAddKernels.cl
index e224eb6ff..f3d5b3486 100644
--- a/Demos/VectorAdd_OpenCL/VectorAddKernels.cl
+++ b/Demos/VectorAdd_OpenCL/VectorAddKernels.cl
@@ -1,13 +1,4 @@
 
-#ifndef GUID_ARG
-#define GUID_ARG
-#endif
-
-
-#ifndef MSTRINGIFY
-#define MSTRINGIFY(A) A
-#endif
-
 
 MSTRINGIFY(
 
diff --git a/src/BulletMultiThreaded/CMakeLists.txt b/src/BulletMultiThreaded/CMakeLists.txt
index b4207a72b..6267a5307 100644
--- a/src/BulletMultiThreaded/CMakeLists.txt
+++ b/src/BulletMultiThreaded/CMakeLists.txt
@@ -67,10 +67,8 @@ ADD_LIBRARY(BulletMultiThreaded
 
 )
 
-#for now, only Direct 11 (Direct Compute)
-IF(USE_DX11)
-	SUBDIRS(GpuSoftBodySolvers)
-ENDIF(USE_DX11)
+
+SUBDIRS(GpuSoftBodySolvers)
 
 
 IF (BUILD_SHARED_LIBS)
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/CMakeLists.txt b/src/BulletMultiThreaded/GpuSoftBodySolvers/CMakeLists.txt
index aaf2e4bef..63cc88b7a 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/CMakeLists.txt
@@ -3,20 +3,12 @@ INCLUDE_DIRECTORIES(
 ${BULLET_PHYSICS_SOURCE_DIR}/src
 )
 
-LIST(APPEND SubDirList "CPU")
 
+SUBDIRS ( 
+	OpenCL
+	CPU 
+)
 
-# Configure use of OpenCL and DX11
-# Generates the settings file and defines libraries and include paths
-OPTION(USE_OPENCL "Use OpenCL"	OFF)
-
-
-
-if( USE_OPENCL )
-	LIST(APPEND SubDirList "OpenCL")
-endif( USE_OPENCL )
-if( USE_DX11 )
-	LIST(APPEND SubDirList "DX11")
-endif( USE_DX11 )
-
-SUBDIRS( ${SubDirList} )
+IF( USE_DX11 )
+	SUBDIRS( DX11 )
+ENDIF( USE_DX11 )
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/CMakeLists.txt b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/CMakeLists.txt
index 5fea665b0..3bfffcdcb 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/CMakeLists.txt
@@ -14,14 +14,17 @@ ${VECTOR_MATH_INCLUDE}
 
 SET(BulletSoftBodyDX11Solvers_SRCS
 	btSoftBodySolver_DX11.cpp
+	btSoftBodySolver_DX11SIMDAware.cpp
 )
 
 SET(BulletSoftBodyDX11Solvers_HDRS
 	btSoftBodySolver_DX11.h
+	btSoftBodySolver_DX11SIMDAware.h
 	../cpu/btSoftBodySolverData.h
 	btSoftBodySolverVertexData_DX11.h
 	btSoftBodySolverTriangleData_DX11.h
 	btSoftBodySolverLinkData_DX11.h
+	btSoftBodySolverLinkData_DX11SIMDAware.h
 	btSoftBodySolverBuffer_DX11.h
 	btSoftBodySolverVertexBuffer_DX11.h
 
@@ -37,6 +40,7 @@ SET(BulletSoftBodyDX11Solvers_Shaders
 	UpdatePositions
 	UpdateNodes
 	SolvePositions
+	SolvePositionsSIMDBatched
 	UpdatePositionsFromVelocities
 	ApplyForces
 	PrepareLinks
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositionsSIMDBatched.hlsl b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositionsSIMDBatched.hlsl
new file mode 100644
index 000000000..5106f612d
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositionsSIMDBatched.hlsl
@@ -0,0 +1,128 @@
+MSTRINGIFY(
+// Constants read by SolvePositionsFromLinksKernel.
+cbuffer SolvePositionsFromLinksKernelCB : register( b0 )
+{
+	int startWaveInBatch;	// index of the first logical wavefront processed by the kernel
+	int numWaves;	// wavefronts at or beyond startWaveInBatch + numWaves are masked out
+	float kst;		// multiplier applied to every link's correction factor
+	float ti;	// not referenced by the kernel in this file
+};
+
+
+// One element per logical wavefront: .x = number of batches, .y = number of vertices used by that wavefront
+StructuredBuffer<int2> g_wavefrontBatchCountsVertexCounts : register( t0 );
+// Global vertex addresses, MAX_NUM_VERTICES_PER_WAVE slots reserved per wavefront
+StructuredBuffer<int> g_vertexAddressesPerWavefront : register( t1 );
+// Inverse mass per vertex, indexed by global vertex address
+StructuredBuffer<float> g_verticesInverseMass : register( t2 );
+
+// Per-link data laid out in sub-batches of WAVEFRONT_SIZE links, MAX_BATCHES_PER_WAVE sub-batches per wavefront
+StructuredBuffer<int2> g_linksVertexIndices : register( t3 );
+StructuredBuffer<float> g_linksMassLSC : register( t4 );
+StructuredBuffer<float> g_linksRestLengthSquared : register( t5 );
+// Vertex positions: read into shared memory at the start of the kernel and written back at the end
+RWStructuredBuffer<float4> g_vertexPositions : register( u0 );
+
+// Data loaded on a per-wave basis; each logical wavefront of the thread group owns its own slice
+groupshared int2 wavefrontBatchCountsVertexCounts[WAVEFRONT_BLOCK_MULTIPLIER];
+groupshared float4 vertexPositionSharedData[MAX_NUM_VERTICES_PER_WAVE*WAVEFRONT_BLOCK_MULTIPLIER];
+groupshared float vertexInverseMassSharedData[MAX_NUM_VERTICES_PER_WAVE*WAVEFRONT_BLOCK_MULTIPLIER];
+
+// Storing the vertex addresses actually slowed things down a little
+//groupshared int vertexAddressSharedData[MAX_NUM_VERTICES_PER_WAVE*WAVEFRONT_BLOCK_MULTIPLIER];
+// Link position solve: each logical wavefront copies its vertices into groupshared memory,
+// applies its link batches there one after another, then writes the vertices back to g_vertexPositions.
+[numthreads(BLOCK_SIZE, 1, 1)]
+void 
+SolvePositionsFromLinksKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
+{
+	const int laneInWavefront = (DTid.x & (WAVEFRONT_SIZE-1));
+	const int wavefront = startWaveInBatch + (DTid.x / WAVEFRONT_SIZE);
+	const int firstWavefrontInBlock = startWaveInBatch + Gid.x * WAVEFRONT_BLOCK_MULTIPLIER;
+	const int localWavefront = wavefront - firstWavefrontInBlock;
+	// NOTE(review): the lane mask assumes WAVEFRONT_SIZE is a power of two -- confirm where it is defined.
+	// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier	
+	if( wavefront < (startWaveInBatch + numWaves) )
+	{
+
+		// Load the batch counts for the wavefronts
+		// Only lane 0 of each logical wavefront fetches the counts; every lane reads them back below
+		if( laneInWavefront == 0 )
+		{
+			int2 batchesAndVertexCountsWithinWavefront = g_wavefrontBatchCountsVertexCounts[firstWavefrontInBlock + localWavefront];
+			wavefrontBatchCountsVertexCounts[localWavefront] = batchesAndVertexCountsWithinWavefront;
+		}
+		// NOTE(review): no barrier separates the store above from the load below; this appears to
+		// rely on all lanes of a logical wavefront executing in lockstep -- confirm for the target hardware.
+		int2 batchesAndVerticesWithinWavefront = wavefrontBatchCountsVertexCounts[localWavefront];
+		int batchesWithinWavefront = batchesAndVerticesWithinWavefront.x;
+		int verticesUsedByWave = batchesAndVerticesWithinWavefront.y;
+
+		// Load the vertices for the wavefronts
+		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
+		{
+			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+
+			//vertexAddressSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = vertexAddress;
+			vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_vertexPositions[vertexAddress];
+			vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress];
+		}
+		// NOTE(review): the solve below reads these shared values without an intervening barrier (same lockstep assumption).
+		// Loop through the batches performing the solve on each in LDS
+		int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE;	
+
+		//for( int batch = 0; batch < batchesWithinWavefront; ++batch )
+		// do/while form: the body runs once before batchesWithinWavefront is compared
+		int batch = 0;
+		do
+		{
+			int baseDataLocation = baseDataLocationForWave + WAVEFRONT_SIZE * batch;
+			int locationOfValue = baseDataLocation + laneInWavefront;
+			
+			
+			// These loads should all be perfectly linear across the WF
+			int2 localVertexIndices = g_linksVertexIndices[locationOfValue];
+			float massLSC = g_linksMassLSC[locationOfValue];
+			float restLengthSquared = g_linksRestLengthSquared[locationOfValue];
+			
+
+			// LDS vertex addresses based on logical wavefront number in block and loaded index
+			int vertexAddress0 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.x;
+			int vertexAddress1 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.y;
+			
+			float3 position0 = vertexPositionSharedData[vertexAddress0].xyz;
+			float3 position1 = vertexPositionSharedData[vertexAddress1].xyz;
+
+			float inverseMass0 = vertexInverseMassSharedData[vertexAddress0];
+			float inverseMass1 = vertexInverseMassSharedData[vertexAddress1]; 
+			// del is the link vector; len holds its squared length, not the length itself
+			float3 del = position1 - position0;
+			float len = dot(del, del);
+			// Correction factor; it stays 0 (no movement) for links whose massLSC is not positive
+			float k = 0;
+			if( massLSC > 0.0f )
+			{		
+				k = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
+			}
+			// Move both end points along the link, each scaled by its own inverse mass
+			position0 = position0 - del*(k*inverseMass0);
+			position1 = position1 + del*(k*inverseMass1);
+			// The w component is stored as 0, so the previous w value is not preserved
+			vertexPositionSharedData[vertexAddress0] = float4(position0, 0.f);
+			vertexPositionSharedData[vertexAddress1] = float4(position1, 0.f);
+			
+			++batch;
+		} while( batch < batchesWithinWavefront );
+		
+		// Update the global memory vertices for the wavefronts
+		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
+		{
+			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+
+			g_vertexPositions[vertexAddress] = vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
+		}
+	}
+		
+}
+
+);
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11SIMDAware.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11SIMDAware.h
new file mode 100644
index 000000000..92864a159
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11SIMDAware.h
@@ -0,0 +1,173 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_DX11.h"
+
+#ifndef BT_ACCELERATED_SOFT_BODY_LINK_DATA_DX11_SIMDAWARE_H
+#define BT_ACCELERATED_SOFT_BODY_LINK_DATA_DX11_SIMDAWARE_H
+
+struct ID3D11Device;
+struct ID3D11DeviceContext;
+
+
+/**
+ * Link data for the SIMD-aware DX11 soft body solver.
+ * Adds to btSoftBodyLinkData the tables describing how links are grouped into
+ * per-wavefront batches, together with btDX11Buffer members for the same data.
+ */
+class btSoftBodyLinkDataDX11SIMDAware : public btSoftBodyLinkData
+{
+public:
+	bool				m_onGPU;
+	ID3D11Device		*m_d3dDevice;
+	ID3D11DeviceContext *m_d3dDeviceContext;
+
+	// Batching parameters. The const members can only be set by the constructor.
+	const int m_wavefrontSize;
+	const int m_linksPerWorkItem;
+	const int m_maxLinksPerWavefront;
+	int m_maxBatchesWithinWave;
+	int m_maxVerticesWithinWave;
+	int m_numWavefronts;
+
+	int m_maxVertex;
+
+	/** Number of batches and number of vertices belonging to one wavefront. */
+	struct NumBatchesVerticesPair
+	{
+		int numBatches;
+		int numVertices;
+	};
+
+	// Array storing number of links in each wavefront
+	btAlignedObjectArray<int>									m_linksPerWavefront;
+	btAlignedObjectArray<NumBatchesVerticesPair>				m_numBatchesAndVerticesWithinWaves;
+	btDX11Buffer< NumBatchesVerticesPair >						m_dx11NumBatchesAndVerticesWithinWaves;
+
+	// All arrays here will contain batches of m_maxLinksPerWavefront links
+	// ordered by wavefront.
+	// with either global vertex pairs or local vertex pairs
+	btAlignedObjectArray< int >									m_wavefrontVerticesGlobalAddresses; // List of global vertices per wavefront
+	btDX11Buffer<int>											m_dx11WavefrontVerticesGlobalAddresses;
+	btAlignedObjectArray< LinkNodePair >						m_linkVerticesLocalAddresses; // Vertex pair for the link
+	btDX11Buffer<LinkNodePair>									m_dx11LinkVerticesLocalAddresses;
+	btDX11Buffer<float>											m_dx11LinkStrength;
+	btDX11Buffer<float>											m_dx11LinksMassLSC;
+	btDX11Buffer<float>											m_dx11LinksRestLengthSquared;
+	btDX11Buffer<float>											m_dx11LinksRestLength;
+	btDX11Buffer<float>											m_dx11LinksMaterialLinearStiffnessCoefficient;
+
+	/** Start index and length of one batch; default-constructed as (0, 0). */
+	struct BatchPair
+	{
+		int start;
+		int length;
+
+		BatchPair() : start(0), length(0) {}
+		BatchPair( int s, int l ) : start( s ), length( l ) {}
+	};
+
+	/**
+	 * Link addressing information for each cloth.
+	 * Allows link locations to be computed independently of data batching.
+	 */
+	btAlignedObjectArray< int >							m_linkAddresses;
+
+	/**
+	 * Start and length values for computation batches over link data.
+	 */
+	btAlignedObjectArray< BatchPair >		m_wavefrontBatchStartLengths;
+
+
+	//ID3D11Buffer*               readBackBuffer;
+	
+	btSoftBodyLinkDataDX11SIMDAware( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext );
+
+	virtual ~btSoftBodyLinkDataDX11SIMDAware();
+
+	/** Allocate enough space in all link-related arrays to fit numLinks links */
+	virtual void createLinks( int numLinks );
+	
+	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
+	virtual void setLinkAt( const LinkDescription &link, int linkIndex );
+
+	virtual bool onAccelerator();
+
+	virtual bool moveToAccelerator();
+
+	virtual bool moveFromAccelerator();
+
+	/**
+	 * Generate (and later update) the batching for the entire link set.
+	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
+	 * In theory we could delay it until just before we need the cloth.
+	 * It's a one-off overhead, though, so that is a later optimisation.
+	 */
+	void generateBatches();
+
+	/** Maximum vertex count per wavefront, as stored in m_maxVerticesWithinWave. */
+	int getMaxVerticesPerWavefront() const
+	{
+		return m_maxVerticesWithinWave;
+	}
+	/** Wavefront size this link data was constructed with. */
+	int getWavefrontSize() const
+	{
+		return m_wavefrontSize;
+	}
+	/** Links per work item this link data was constructed with. */
+	int getLinksPerWorkItem() const
+	{
+		return m_linksPerWorkItem;
+	}
+	/** Maximum link count per wavefront this link data was constructed with. */
+	int getMaxLinksPerWavefront() const
+	{
+		return m_maxLinksPerWavefront;
+	}
+	/** Maximum batch count per wavefront, as stored in m_maxBatchesWithinWave. */
+	int getMaxBatchesPerWavefront() const
+	{
+		return m_maxBatchesWithinWave;
+	}
+	/** Number of wavefronts, as stored in m_numWavefronts. */
+	int getNumWavefronts() const
+	{
+		return m_numWavefronts;
+	}
+	/** Batch and vertex counts of the given wavefront (returned by value; the index is not range-checked here). */
+	NumBatchesVerticesPair getNumBatchesAndVerticesWithinWavefront( int wavefront ) const
+	{
+		return m_numBatchesAndVerticesWithinWaves[wavefront];
+	}
+	/** Entry vertexIndex of the per-wavefront global vertex address list. */
+	int getVertexGlobalAddresses( int vertexIndex ) const
+	{
+		return m_wavefrontVerticesGlobalAddresses[vertexIndex];
+	}
+
+	/**
+	 * Get post-batching local addresses of the vertex pair for a link assuming all vertices used by a wavefront are loaded locally.
+	 */
+	LinkNodePair getVertexPairLocalAddresses( int linkIndex ) const
+	{
+		return m_linkVerticesLocalAddresses[linkIndex];
+	}
+
+};
+
+
+#endif // #ifndef BT_ACCELERATED_SOFT_BODY_LINK_DATA_DX11_SIMDAWARE_H
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
index 9c9b325a8..7877aa6a0 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
@@ -622,7 +622,7 @@ void btDX11SoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &softB
 			using Vectormath::Aos::Point3;
 
 			// Create SoftBody that will store the information within the solver
-			btAcceleratedSoftBodyInterface *newSoftBody = new btAcceleratedSoftBodyInterface( softBody );
+			btDX11AcceleratedSoftBodyInterface *newSoftBody = new btDX11AcceleratedSoftBodyInterface( softBody );
 			m_softBodySet.push_back( newSoftBody );
 
 			m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );
@@ -1451,11 +1451,11 @@ void btDX11SoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( float
 
 
 
-btDX11SoftBodySolver::btAcceleratedSoftBodyInterface *btDX11SoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
+btDX11AcceleratedSoftBodyInterface *btDX11SoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
 {
 	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
 	{
-		btAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
+		btDX11AcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
 		if( softBodyInterface->getSoftBody() == softBody )
 			return softBodyInterface;
 	}
@@ -1466,7 +1466,7 @@ void btDX11SoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * const
 {
 	checkInitialized();
 	
-	btAcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );
+	btDX11AcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );
 
 
 	const int firstVertex = currentCloth->getFirstVertex();
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
index a61e5166c..ea5b3d462 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
@@ -13,6 +13,9 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
+#ifndef BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
+#define BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
+
 
 #include "vectormath/vmInclude.h"
 #include "BulletSoftBody/btSoftBodySolvers.h"
@@ -22,185 +25,184 @@ subject to the following restrictions:
 #include "btSoftBodySolverTriangleData_DX11.h"
 
 
-#ifndef BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
-#define BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
+
+/**
+ * Per-soft-body bookkeeping record kept by a solver: counts and first-element
+ * offsets that locate this body's vertices, triangles and links in the shared
+ * solver arrays. The btSoftBody pointer is stored as given and never deleted here.
+ */
+class btDX11AcceleratedSoftBodyInterface
+{
+protected:
+	/** Current number of vertices that are part of this cloth */
+	int m_numVertices;
+	/** Maximum number of vertices allocated to be part of this cloth */
+	int m_maxVertices;
+	/** Current number of triangles that are part of this cloth */
+	int m_numTriangles;
+	/** Maximum number of triangles allocated to be part of this cloth */
+	int m_maxTriangles;
+	/** Index of first vertex in the world allocated to this cloth */
+	int m_firstVertex;
+	/** Index of first triangle in the world allocated to this cloth */
+	int m_firstTriangle;
+	/** Index of first link in the world allocated to this cloth */
+	int m_firstLink;
+	/** Maximum number of links allocated to this cloth */
+	int m_maxLinks;
+	/** Current number of links allocated to this cloth */
+	int m_numLinks;
+
+	/** The actual soft body this data represents */
+	btSoftBody *m_softBody;
+
+
+public:
+	explicit btDX11AcceleratedSoftBodyInterface( btSoftBody *softBody ) :
+	  m_numVertices( 0 ),
+	  m_maxVertices( 0 ),
+	  m_numTriangles( 0 ),
+	  m_maxTriangles( 0 ),
+	  m_firstVertex( 0 ),
+	  m_firstTriangle( 0 ),
+	  m_firstLink( 0 ),
+	  m_maxLinks( 0 ),
+	  m_numLinks( 0 ),
+	  m_softBody( softBody )
+	{	// every count and offset starts at zero; callers fill them in through the setters below
+	}
+	int getNumVertices() const
+	{
+		return m_numVertices;
+	}
+
+	int getNumTriangles() const
+	{
+		return m_numTriangles;
+	}
+
+	int getMaxVertices() const
+	{
+		return m_maxVertices;
+	}
+
+	int getMaxTriangles() const
+	{
+		return m_maxTriangles;
+	}
+
+	int getFirstVertex() const
+	{
+		return m_firstVertex;
+	}
+
+	int getFirstTriangle() const
+	{
+		return m_firstTriangle;
+	}
+
+	// TODO: All of these set functions will have to do checks and
+	// update the world because restructuring of the arrays will be necessary
+	// Reasonable use of "friend"?
+	void setNumVertices( int numVertices )
+	{
+		m_numVertices = numVertices;
+	}
+
+	void setNumTriangles( int numTriangles )
+	{
+		m_numTriangles = numTriangles;
+	}
+
+	void setMaxVertices( int maxVertices )
+	{
+		m_maxVertices = maxVertices;
+	}
+
+	void setMaxTriangles( int maxTriangles )
+	{
+		m_maxTriangles = maxTriangles;
+	}
+
+	void setFirstVertex( int firstVertex )
+	{
+		m_firstVertex = firstVertex;
+	}
+
+	void setFirstTriangle( int firstTriangle )
+	{
+		m_firstTriangle = firstTriangle;
+	}
+
+	void setMaxLinks( int maxLinks )
+	{
+		m_maxLinks = maxLinks;
+	}
+
+	void setNumLinks( int numLinks )
+	{
+		m_numLinks = numLinks;
+	}
+
+	void setFirstLink( int firstLink )
+	{
+		m_firstLink = firstLink;
+	}
+
+	int getMaxLinks() const
+	{
+		return m_maxLinks;
+	}
+
+	int getNumLinks() const
+	{
+		return m_numLinks;
+	}
+
+	int getFirstLink() const
+	{
+		return m_firstLink;
+	}
+
+	btSoftBody* getSoftBody() const
+	{
+		return m_softBody;
+	}
+
+#if 0
+	void setAcceleration( Vectormath::Aos::Vector3 acceleration )
+	{
+		m_currentSolver->setPerClothAcceleration( m_clothIdentifier, acceleration );
+	}
+
+	void setWindVelocity( Vectormath::Aos::Vector3 windVelocity )
+	{
+		m_currentSolver->setPerClothWindVelocity( m_clothIdentifier, windVelocity );
+	}
+
+	/** 
+	 * Set the density of the air in which the cloth is situated.
+	 */
+	void setAirDensity( btScalar density )
+	{
+		m_currentSolver->setPerClothMediumDensity( m_clothIdentifier, static_cast<float>(density) );
+	}
+
+	/**
+	 * Add a collision object to this soft body.
+	 */
+	void addCollisionObject( btCollisionObject *collisionObject )
+	{
+		m_currentSolver->addCollisionObjectForSoftBody( m_clothIdentifier, collisionObject );
+	}
+#endif
+};
+
 
 class btDX11SoftBodySolver : public btSoftBodySolver
 {
 public:
 
-	/**
-	 * SoftBody class to maintain information about a soft body instance
-	 * within a solver.
-	 * This data addresses the main solver arrays.
-	 */
-	class btAcceleratedSoftBodyInterface
-	{
-	protected:
-		/** Current number of vertices that are part of this cloth */
-		int m_numVertices;
-		/** Maximum number of vertices allocated to be part of this cloth */
-		int m_maxVertices;
-		/** Current number of triangles that are part of this cloth */
-		int m_numTriangles;
-		/** Maximum number of triangles allocated to be part of this cloth */
-		int m_maxTriangles;
-		/** Index of first vertex in the world allocated to this cloth */
-		int m_firstVertex;
-		/** Index of first triangle in the world allocated to this cloth */
-		int m_firstTriangle;
-		/** Index of first link in the world allocated to this cloth */
-		int m_firstLink;
-		/** Maximum number of links allocated to this cloth */
-		int m_maxLinks;
-		/** Current number of links allocated to this cloth */
-		int m_numLinks;
-
-		/** The actual soft body this data represents */
-		btSoftBody *m_softBody;
-
-
-	public:
-		btAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
-		  m_softBody( softBody )
-		{
-			m_numVertices = 0;
-			m_maxVertices = 0;
-			m_numTriangles = 0;
-			m_maxTriangles = 0;
-			m_firstVertex = 0;
-			m_firstTriangle = 0;
-			m_firstLink = 0;
-			m_maxLinks = 0;
-			m_numLinks = 0;
-		}
-		int getNumVertices()
-		{
-			return m_numVertices;
-		}
-
-		int getNumTriangles()
-		{
-			return m_numTriangles;
-		}
-
-		int getMaxVertices()
-		{
-			return m_maxVertices;
-		}
-
-		int getMaxTriangles()
-		{
-			return m_maxTriangles;
-		}
-
-		int getFirstVertex()
-		{
-			return m_firstVertex;
-		}
-
-		int getFirstTriangle()
-		{
-			return m_firstTriangle;
-		}
-
-		// TODO: All of these set functions will have to do checks and
-		// update the world because restructuring of the arrays will be necessary
-		// Reasonable use of "friend"?
-		void setNumVertices( int numVertices )
-		{
-			m_numVertices = numVertices;
-		}	
-		
-		void setNumTriangles( int numTriangles )
-		{
-			m_numTriangles = numTriangles;
-		}
-
-		void setMaxVertices( int maxVertices )
-		{
-			m_maxVertices = maxVertices;
-		}
-
-		void setMaxTriangles( int maxTriangles )
-		{
-			m_maxTriangles = maxTriangles;
-		}
-
-		void setFirstVertex( int firstVertex )
-		{
-			m_firstVertex = firstVertex;
-		}
-
-		void setFirstTriangle( int firstTriangle )
-		{
-			m_firstTriangle = firstTriangle;
-		}
-
-		void setMaxLinks( int maxLinks )
-		{
-			m_maxLinks = maxLinks;
-		}
-
-		void setNumLinks( int numLinks )
-		{
-			m_numLinks = numLinks;
-		}
-
-		void setFirstLink( int firstLink )
-		{
-			m_firstLink = firstLink;
-		}
-
-		int getMaxLinks()
-		{
-			return m_maxLinks;
-		}
-
-		int getNumLinks()
-		{
-			return m_numLinks;
-		}
-
-		int getFirstLink()
-		{
-			return m_firstLink;
-		}
-
-		btSoftBody* getSoftBody()
-		{
-			return m_softBody;
-		}
-
-	#if 0
-		void setAcceleration( Vectormath::Aos::Vector3 acceleration )
-		{
-			m_currentSolver->setPerClothAcceleration( m_clothIdentifier, acceleration );
-		}
-
-		void setWindVelocity( Vectormath::Aos::Vector3 windVelocity )
-		{
-			m_currentSolver->setPerClothWindVelocity( m_clothIdentifier, windVelocity );
-		}
-
-		/** 
-		 * Set the density of the air in which the cloth is situated.
-		 */
-		void setAirDensity( btScalar density )
-		{
-			m_currentSolver->setPerClothMediumDensity( m_clothIdentifier, static_cast<float>(density) );
-		}
-
-		/**
-		 * Add a collision object to this soft body.
-		 */
-		void addCollisionObject( btCollisionObject *collisionObject )
-		{
-			m_currentSolver->addCollisionObjectForSoftBody( m_clothIdentifier, collisionObject );
-		}
-	#endif
-	};
-
 
 	class KernelDesc
 	{
@@ -344,7 +346,7 @@ private:
 	 * Cloths owned by this solver.
 	 * Only our cloths are in this array.
 	 */
-	btAlignedObjectArray< btAcceleratedSoftBodyInterface * > m_softBodySet;
+	btAlignedObjectArray< btDX11AcceleratedSoftBodyInterface * > m_softBodySet;
 
 	/** Acceleration value to be applied to all non-static vertices in the solver. 
 	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
@@ -429,7 +431,7 @@ private:
 	
 	void updateConstants( float timeStep );
 
-	btAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
+	btDX11AcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
 
 	//////////////////////////////////////
 	// Kernel dispatches
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp
new file mode 100644
index 000000000..c72dead3e
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp
@@ -0,0 +1,1793 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <cstdio>
+
+// Batching parameters: WAVEFRONT_SIZE and LINKS_PER_SIMD_LANE seed m_wavefrontSize and m_linksPerWorkItem in the link data constructor below.
+#define WAVEFRONT_SIZE 32
+#define WAVEFRONT_BLOCK_MULTIPLIER 2
+#define LINKS_PER_SIMD_LANE 16
+// Two-level stringification: STRINGIFY expands its argument first, then STRINGIFY2 turns the result into a string literal.
+#define STRINGIFY( S ) STRINGIFY2( S )
+#define STRINGIFY2( S ) #S
+
+#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
+#include "vectormath/vmInclude.h"
+
+#include "btSoftBodySolverLinkData_DX11SIMDAware.h"
+#include "btSoftBodySolver_DX11SIMDAware.h"
+#include "btSoftBodySolverVertexBuffer_DX11.h"
+#include "BulletSoftBody/btSoftBody.h"
+// HLSL kernel sources embedded as C strings: each included .hlsl file supplies the initializer (presumably wrapped in MSTRINGIFY - confirm in the HLSL files).
+#define MSTRINGIFY(A) #A
+static char* PrepareLinksHLSLString = 
+#include "HLSL/PrepareLinks.hlsl"
+static char* UpdatePositionsFromVelocitiesHLSLString = 
+#include "HLSL/UpdatePositionsFromVelocities.hlsl"
+static char* SolvePositionsSIMDBatchedHLSLString = 
+#include "HLSL/SolvePositionsSIMDBatched.hlsl"
+static char* UpdateNodesHLSLString = 
+#include "HLSL/UpdateNodes.hlsl"
+static char* UpdatePositionsHLSLString = 
+#include "HLSL/UpdatePositions.hlsl"
+static char* UpdateConstantsHLSLString = 
+#include "HLSL/UpdateConstants.hlsl"
+static char* IntegrateHLSLString = 
+#include "HLSL/Integrate.hlsl"
+static char* ApplyForcesHLSLString = 
+#include "HLSL/ApplyForces.hlsl"
+static char* UpdateNormalsHLSLString = 
+#include "HLSL/UpdateNormals.hlsl"
+static char* OutputToVertexArrayHLSLString = 
+#include "HLSL/OutputToVertexArray.hlsl"
+static char* VSolveLinksHLSLString = 
+#include "HLSL/VSolveLinks.hlsl"
+
+
+
+btSoftBodyLinkDataDX11SIMDAware::btSoftBodyLinkDataDX11SIMDAware( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext ) : 
+		m_d3dDevice( d3dDevice ),
+		m_d3dDeviceContext( d3dDeviceContext ),
+		m_wavefrontSize( WAVEFRONT_SIZE ),
+		m_linksPerWorkItem( LINKS_PER_SIMD_LANE ),
+		m_maxBatchesWithinWave( 0 ),
+		m_maxLinksPerWavefront( m_wavefrontSize * m_linksPerWorkItem ),
+		m_numWavefronts( 0 ),
+		m_maxVertex( 0 ),
+		m_dx11NumBatchesAndVerticesWithinWaves( d3dDevice, d3dDeviceContext, &m_numBatchesAndVerticesWithinWaves, true ),
+		m_dx11WavefrontVerticesGlobalAddresses( d3dDevice, d3dDeviceContext, &m_wavefrontVerticesGlobalAddresses, true ),
+		m_dx11LinkVerticesLocalAddresses( d3dDevice, d3dDeviceContext, &m_linkVerticesLocalAddresses, true ),
+		m_dx11LinkStrength( d3dDevice, d3dDeviceContext, &m_linkStrength, true ),
+		m_dx11LinksMassLSC( d3dDevice, d3dDeviceContext, &m_linksMassLSC, true ),
+		m_dx11LinksRestLengthSquared( d3dDevice, d3dDeviceContext, &m_linksRestLengthSquared, true ),
+		m_dx11LinksRestLength( d3dDevice, d3dDeviceContext, &m_linksRestLength, true ),
+		m_dx11LinksMaterialLinearStiffnessCoefficient( d3dDevice, d3dDeviceContext, &m_linksMaterialLinearStiffnessCoefficient, true )
+{
+	// m_d3dDevice and m_d3dDeviceContext are already set by the initializer list, so nothing is re-assigned here.
+	// NOTE(review): m_maxLinksPerWavefront reads m_wavefrontSize and m_linksPerWorkItem, so both must be declared before it - confirm in the header.
+}
+
+btSoftBodyLinkDataDX11SIMDAware::~btSoftBodyLinkDataDX11SIMDAware()
+{	// Empty body: this destructor performs no explicit cleanup of its own.
+}
+
+// Converts a Bullet btVector3 into a Vectormath Vector3 by copying its X, Y and Z components.
+static Vectormath::Aos::Vector3 toVector3( const btVector3 &vec )
+{
+	return Vectormath::Aos::Vector3( vec.getX(), vec.getY(), vec.getZ() );
+}
+
+void btSoftBodyLinkDataDX11SIMDAware::createLinks( int numLinks )
+{
+	// Work out the target size before the base class call, which may change m_links.
+	const int linkCountAfterGrowth = m_links.size() + numLinks;
+
+	btSoftBodyLinkData::createLinks( numLinks );
+
+	// Keep the link address table the same length as the link set
+	m_linkAddresses.resize( linkCountAfterGrowth );
+}
+
+void btSoftBodyLinkDataDX11SIMDAware::setLinkAt( const btSoftBodyLinkData::LinkDescription &link, int linkIndex )
+{
+	btSoftBodyLinkData::setLinkAt( link, linkIndex );
+	// Raise m_maxVertex to the largest vertex index referenced by any link stored so far
+	if( m_maxVertex < link.getVertex0() )
+		m_maxVertex = link.getVertex0();
+	if( m_maxVertex < link.getVertex1() )
+		m_maxVertex = link.getVertex1();
+
+	// Initial mapping: every link's address starts out as its own index
+	m_linkAddresses[linkIndex] = linkIndex;
+}
+
+bool btSoftBodyLinkDataDX11SIMDAware::onAccelerator()
+{
+	return m_onGPU;	// set true by moveToAccelerator() and false by moveFromAccelerator(), each only on success
+}
+
+bool btSoftBodyLinkDataDX11SIMDAware::moveToAccelerator()
+{
+	// Move each buffer in the order listed; && short-circuits, so nothing further is attempted after the first failure.
+	const bool allMoved =
+		m_dx11NumBatchesAndVerticesWithinWaves.moveToGPU() &&
+		m_dx11WavefrontVerticesGlobalAddresses.moveToGPU() &&
+		m_dx11LinkVerticesLocalAddresses.moveToGPU() &&
+		m_dx11LinkStrength.moveToGPU() &&
+		m_dx11LinksMassLSC.moveToGPU() &&
+		m_dx11LinksRestLengthSquared.moveToGPU() &&
+		m_dx11LinksRestLength.moveToGPU() &&
+		m_dx11LinksMaterialLinearStiffnessCoefficient.moveToGPU();
+
+	if( allMoved )
+		m_onGPU = true;
+
+	return allMoved;
+}
+
+bool btSoftBodyLinkDataDX11SIMDAware::moveFromAccelerator()
+{
+	const bool allMoved =	// && short-circuits: after the first failing buffer no further moveFromGPU() call is made
+		m_dx11NumBatchesAndVerticesWithinWaves.moveFromGPU() &&
+		m_dx11WavefrontVerticesGlobalAddresses.moveFromGPU() &&
+		m_dx11LinkVerticesLocalAddresses.moveFromGPU() &&
+		m_dx11LinkStrength.moveFromGPU() &&
+		m_dx11LinksMassLSC.moveFromGPU() &&
+		m_dx11LinksRestLengthSquared.moveFromGPU() &&
+		m_dx11LinksRestLength.moveFromGPU() &&
+		m_dx11LinksMaterialLinearStiffnessCoefficient.moveFromGPU();
+
+	if( allMoved )
+		m_onGPU = false;
+
+	return allMoved;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+btDX11SIMDAwareSoftBodySolver::btDX11SIMDAwareSoftBodySolver(ID3D11Device * dx11Device, ID3D11DeviceContext* dx11Context) :
+	m_dx11Device( dx11Device ),
+	m_dx11Context( dx11Context ),
+	m_linkData( dx11Device, dx11Context ),
+	m_vertexData( dx11Device, dx11Context ),
+	m_triangleData( dx11Device, dx11Context ),
+	m_dx11PerClothAcceleration( dx11Device, dx11Context, &m_perClothAcceleration, true ),
+	m_dx11PerClothWindVelocity( dx11Device, dx11Context, &m_perClothWindVelocity, true ),
+	m_dx11PerClothDampingFactor( dx11Device, dx11Context, &m_perClothDampingFactor, true ),
+	m_dx11PerClothVelocityCorrectionCoefficient( dx11Device, dx11Context, &m_perClothVelocityCorrectionCoefficient, true ),
+	m_dx11PerClothLiftFactor( dx11Device, dx11Context, &m_perClothLiftFactor, true ),
+	m_dx11PerClothDragFactor( dx11Device, dx11Context, &m_perClothDragFactor, true ),
+	m_dx11PerClothMediumDensity( dx11Device, dx11Context, &m_perClothMediumDensity, true )
+{
+	// Sub-objects above are built from the constructor arguments rather than from m_dx11Device / m_dx11Context, so they do not depend on member declaration order.
+	// Initially we will clearly need to update solver constants. For now this flag is global for the cloths linked with this solver -
+	// we should probably make it body specific for performance once we understand more clearly when constants need to be updated.
+	m_updateSolverConstants = true;
+
+	m_shadersInitialized = false;
+}
+
+void btDX11SIMDAwareSoftBodySolver::releaseKernels()
+{	// Passes the constBuffer and kernel members of each kernel listed below to SAFE_RELEASE.
+	SAFE_RELEASE( integrateKernel.constBuffer );
+	SAFE_RELEASE( integrateKernel.kernel );
+	SAFE_RELEASE( solvePositionsFromLinksKernel.constBuffer );
+	SAFE_RELEASE( solvePositionsFromLinksKernel.kernel );
+	SAFE_RELEASE( updatePositionsFromVelocitiesKernel.constBuffer );
+	SAFE_RELEASE( updatePositionsFromVelocitiesKernel.kernel );
+	SAFE_RELEASE( updateVelocitiesFromPositionsWithoutVelocitiesKernel.constBuffer );
+	SAFE_RELEASE( updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel );
+	SAFE_RELEASE( updateVelocitiesFromPositionsWithVelocitiesKernel.constBuffer );
+	SAFE_RELEASE( updateVelocitiesFromPositionsWithVelocitiesKernel.kernel );
+	SAFE_RELEASE( resetNormalsAndAreasKernel.constBuffer );
+	SAFE_RELEASE( resetNormalsAndAreasKernel.kernel );
+	SAFE_RELEASE( normalizeNormalsAndAreasKernel.constBuffer );
+	SAFE_RELEASE( normalizeNormalsAndAreasKernel.kernel );
+	SAFE_RELEASE( updateSoftBodiesKernel.constBuffer );
+	SAFE_RELEASE( updateSoftBodiesKernel.kernel );
+	SAFE_RELEASE( outputToVertexArrayWithNormalsKernel.constBuffer );
+	SAFE_RELEASE( outputToVertexArrayWithNormalsKernel.kernel );
+	SAFE_RELEASE( outputToVertexArrayWithoutNormalsKernel.constBuffer );
+	SAFE_RELEASE( outputToVertexArrayWithoutNormalsKernel.kernel );
+
+	// Second group of kernels, handled the same way (constant buffer first, then the shader)
+	SAFE_RELEASE( addVelocityKernel.constBuffer );
+	SAFE_RELEASE( addVelocityKernel.kernel );
+	SAFE_RELEASE( applyForcesKernel.constBuffer );
+	SAFE_RELEASE( applyForcesKernel.kernel );
+	SAFE_RELEASE( outputToVertexArrayKernel.constBuffer );
+	SAFE_RELEASE( outputToVertexArrayKernel.kernel );
+	SAFE_RELEASE( collideCylinderKernel.constBuffer );
+	SAFE_RELEASE( collideCylinderKernel.kernel );	
+	// Clear the flag so that checkInitialized() calls buildShaders() again before the kernels are next used
+	m_shadersInitialized = false;
+}
+
+btDX11SIMDAwareSoftBodySolver::~btDX11SIMDAwareSoftBodySolver()
+{
+	releaseKernels();
+	for( int i = 0; i < m_softBodySet.size(); ++i ) delete m_softBodySet[i];	// records are allocated with new in optimize(); NOTE(review): confirm no other owner
+}
+
+void btDX11SIMDAwareSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &softBodies )
+{
+	// Reloads all solver-side vertex, triangle and link data from softBodies whenever the number of bodies differs from what the solver holds.
+	if( m_softBodySet.size() != softBodies.size() )
+	{
+		// Have a change in the soft body set so update, reloading all the data
+		getVertexData().clear();
+		getTriangleData().clear();
+		getLinkData().clear();
+		// Free the interface records created by a previous call before dropping the pointers to them
+		for( int oldIndex = 0; oldIndex < m_softBodySet.size(); ++oldIndex )
+			delete m_softBodySet[oldIndex];
+		m_softBodySet.resize(0);
+
+		for( int softBodyIndex = 0; softBodyIndex < softBodies.size(); ++softBodyIndex )
+		{
+			btSoftBody *softBody = softBodies[ softBodyIndex ];
+			using Vectormath::Aos::Matrix3;
+			using Vectormath::Aos::Point3;
+
+			// Create SoftBody that will store the information within the solver
+			btAcceleratedSoftBodyInterface *newSoftBody = new btAcceleratedSoftBodyInterface( softBody );
+			m_softBodySet.push_back( newSoftBody );
+
+			// NOTE(review): these per-cloth arrays are only appended to here, so a rebuild keeps the previous entries in front - confirm whether they should be reset as well
+			m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );
+			m_perClothDampingFactor.push_back(softBody->m_cfg.kDP);
+			m_perClothVelocityCorrectionCoefficient.push_back( softBody->m_cfg.kVCF );
+			m_perClothLiftFactor.push_back( softBody->m_cfg.kLF );
+			m_perClothDragFactor.push_back( softBody->m_cfg.kDG );
+			m_perClothMediumDensity.push_back(softBody->getWorldInfo()->air_density);
+
+			// Add space for new vertices and triangles in the default solver for now
+			// TODO: Include space here for tearing too later
+			int firstVertex = getVertexData().getNumVertices();
+			int numVertices = softBody->m_nodes.size();
+			int maxVertices = numVertices;
+			// Allocate space for new vertices in all the vertex arrays
+			getVertexData().createVertices( maxVertices, softBodyIndex );
+
+			int firstTriangle = getTriangleData().getNumTriangles();
+			int numTriangles = softBody->m_faces.size();
+			int maxTriangles = numTriangles;
+			getTriangleData().createTriangles( maxTriangles );
+
+			// Copy vertices from softbody into the solver
+			for( int vertex = 0; vertex < numVertices; ++vertex )
+			{
+				Point3 multPoint(softBody->m_nodes[vertex].m_x.getX(), softBody->m_nodes[vertex].m_x.getY(), softBody->m_nodes[vertex].m_x.getZ());
+				btSoftBodyVertexData::VertexDescription desc;
+
+				// TODO: Position in the softbody might be pre-transformed
+				// or we may need to adapt for the pose.
+				//desc.setPosition( cloth.getMeshTransform()*multPoint );
+				desc.setPosition( multPoint );
+
+				float vertexInverseMass = softBody->m_nodes[vertex].m_im;
+				desc.setInverseMass(vertexInverseMass);
+				getVertexData().setVertexAt( desc, firstVertex + vertex );
+			}
+
+			// Copy triangles similarly
+			// We're assuming here that vertex indices are based on the firstVertex rather than the entire scene
+			for( int triangle = 0; triangle < numTriangles; ++triangle )
+			{
+				// Note that large array storage is relative to the array not to the cloth
+				// So we need to add firstVertex to each value
+				int vertexIndex0 = (softBody->m_faces[triangle].m_n[0] - &(softBody->m_nodes[0]));
+				int vertexIndex1 = (softBody->m_faces[triangle].m_n[1] - &(softBody->m_nodes[0]));
+				int vertexIndex2 = (softBody->m_faces[triangle].m_n[2] - &(softBody->m_nodes[0]));
+				btSoftBodyTriangleData::TriangleDescription newTriangle(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, vertexIndex2 + firstVertex);
+				getTriangleData().setTriangleAt( newTriangle, firstTriangle + triangle );
+
+				// Increase vertex triangle counts for this triangle
+				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex0)++;
+				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex1)++;
+				getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex2)++;
+			}
+
+			int firstLink = getLinkData().getNumLinks();
+			int numLinks = softBody->m_links.size();
+			int maxLinks = numLinks;
+
+			// Allocate space for the links
+			getLinkData().createLinks( numLinks );
+
+			// Add the links
+			for( int link = 0; link < numLinks; ++link )
+			{
+				int vertexIndex0 = softBody->m_links[link].m_n[0] - &(softBody->m_nodes[0]);
+				int vertexIndex1 = softBody->m_links[link].m_n[1] - &(softBody->m_nodes[0]);
+
+				btSoftBodyLinkData::LinkDescription newLink(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, softBody->m_links[link].m_material->m_kLST);
+				newLink.setLinkStrength(1.f);
+				getLinkData().setLinkAt(newLink, firstLink + link);
+			}
+
+			newSoftBody->setFirstVertex( firstVertex );
+			newSoftBody->setFirstTriangle( firstTriangle );
+			newSoftBody->setNumVertices( numVertices );
+			newSoftBody->setMaxVertices( maxVertices );
+			newSoftBody->setNumTriangles( numTriangles );
+			newSoftBody->setMaxTriangles( maxTriangles );
+			newSoftBody->setFirstLink( firstLink );
+			newSoftBody->setNumLinks( numLinks );
+			newSoftBody->setMaxLinks( maxLinks );
+		}
+
+		updateConstants(0.f);
+
+		m_linkData.generateBatches();
+		m_triangleData.generateBatches();
+
+		// Build the shaders to match the batching parameters
+		buildShaders();
+	}
+}
+
+
+btSoftBodyLinkData &btDX11SIMDAwareSoftBodySolver::getLinkData()
+{
+	// Returns m_linkData by non-const reference. TODO: Consider setting link data to "changed" here
+	return m_linkData;
+}
+
+btSoftBodyVertexData &btDX11SIMDAwareSoftBodySolver::getVertexData()
+{
+	// Returns m_vertexData by non-const reference. TODO: Consider setting vertex data to "changed" here
+	return m_vertexData;
+}
+
+btSoftBodyTriangleData &btDX11SIMDAwareSoftBodySolver::getTriangleData()
+{
+	// Returns m_triangleData by non-const reference. TODO: Consider setting triangle data to "changed" here
+	return m_triangleData;
+}
+
+
+bool btDX11SIMDAwareSoftBodySolver::checkInitialized()
+{
+	// Build the shaders if the flag is not yet set; it only flips once buildShaders() reports success
+	if( !m_shadersInitialized && buildShaders() )
+		m_shadersInitialized = true;
+
+	return m_shadersInitialized;
+}
+
+void btDX11SIMDAwareSoftBodySolver::resetNormalsAndAreas( int numVertices )
+{
+	// Dispatches resetNormalsAndAreasKernel with one thread group per 128 vertices (rounded up) on the vertex normal and area buffers.
+	// Copy kernel parameters to GPU
+	UpdateSoftBodiesCB constBuffer;
+	
+	constBuffer.numNodes = numVertices;
+	constBuffer.epsilon = FLT_EPSILON;
+	
+	// NOTE(review): this maps integrateKernel.constBuffer, not resetNormalsAndAreasKernel.constBuffer - confirm that is intended and that it holds sizeof(UpdateSoftBodiesCB). Todo: factor this out.
+	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
+	m_dx11Context->Map( integrateKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
+	memcpy( MappedResource.pData, &constBuffer, sizeof(UpdateSoftBodiesCB) );	
+	m_dx11Context->Unmap( integrateKernel.constBuffer, 0 );
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &integrateKernel.constBuffer );
+
+	// Bind the vertex normal (u0) and vertex area (u1) buffers for writing
+	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexNormal.getUAV()), NULL );
+	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexArea.getUAV()), NULL );
+
+	// Execute the kernel
+	m_dx11Context->CSSetShader( resetNormalsAndAreasKernel.kernel, NULL, 0 );
+
+	int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
+	m_dx11Context->Dispatch(numBlocks, 1, 1 );
+
+	{
+		// Tidy up: unbind both UAV slots and the constant buffer slot used above
+		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
+		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
+		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
+
+		ID3D11Buffer *pBufferNull = NULL;
+		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
+	}	
+} // btDX11SIMDAwareSoftBodySolver::resetNormalsAndAreas
+
+void btDX11SIMDAwareSoftBodySolver::normalizeNormalsAndAreas( int numVertices )
+{
+	// Dispatches normalizeNormalsAndAreasKernel with one thread group per 128 vertices (rounded up), reading the per-vertex triangle counts.
+	// Copy kernel parameters to GPU
+	UpdateSoftBodiesCB constBuffer;
+	
+	constBuffer.numNodes = numVertices;
+	constBuffer.epsilon = FLT_EPSILON;
+	
+	// NOTE(review): this maps integrateKernel.constBuffer, not normalizeNormalsAndAreasKernel.constBuffer - confirm that is intended and that it holds sizeof(UpdateSoftBodiesCB). Todo: factor this out.
+	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
+	m_dx11Context->Map( integrateKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
+	memcpy( MappedResource.pData, &constBuffer, sizeof(UpdateSoftBodiesCB) );	
+	m_dx11Context->Unmap( integrateKernel.constBuffer, 0 );
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &integrateKernel.constBuffer );
+
+	// Bind the triangle-count buffer as SRV slot 2, then the normal (u0) and area (u1) buffers for writing
+	m_dx11Context->CSSetShaderResources( 2, 1, &(m_vertexData.m_dx11VertexTriangleCount.getSRV()) );
+
+	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexNormal.getUAV()), NULL );
+	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexArea.getUAV()), NULL );
+
+	// Execute the kernel
+	m_dx11Context->CSSetShader( normalizeNormalsAndAreasKernel.kernel, NULL, 0 );
+
+	int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
+	m_dx11Context->Dispatch(numBlocks, 1, 1 );
+
+	{
+		// Tidy up: unbind SRV slot 2, both UAV slots and the constant buffer slot used above
+		ID3D11ShaderResourceView* pViewNULL = NULL;
+		m_dx11Context->CSSetShaderResources( 2, 1, &pViewNULL );
+
+		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
+		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
+		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
+
+		ID3D11Buffer *pBufferNull = NULL;
+		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
+	}	
+} // btDX11SIMDAwareSoftBodySolver::normalizeNormalsAndAreas
+
+void btDX11SIMDAwareSoftBodySolver::executeUpdateSoftBodies( int firstTriangle, int numTriangles )
+{
+	// Dispatches updateSoftBodiesKernel for the triangle range [firstTriangle, firstTriangle + numTriangles), one thread group per 128 triangles (rounded up).
+	// Value-initialise the parameter block so that fields not set below are not copied to the GPU uninitialised
+	UpdateSoftBodiesCB constBuffer = UpdateSoftBodiesCB();
+
+	constBuffer.startFace = firstTriangle;
+	constBuffer.numFaces = numTriangles;
+
+	// Todo: factor this out. Number of nodes is static and sdt might be, too, we can update this just once on setup
+	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
+	m_dx11Context->Map( updateSoftBodiesKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
+	memcpy( MappedResource.pData, &constBuffer, sizeof(UpdateSoftBodiesCB) );
+	m_dx11Context->Unmap( updateSoftBodiesKernel.constBuffer, 0 );
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &updateSoftBodiesKernel.constBuffer );
+
+	// Inputs: triangle vertex indices (t0) and vertex positions (t1)
+	m_dx11Context->CSSetShaderResources( 0, 1, &(m_triangleData.m_dx11VertexIndices.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 1, 1, &(m_vertexData.m_dx11VertexPosition.getSRV()) );
+
+	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexNormal.getUAV()), NULL );
+	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexArea.getUAV()), NULL );
+	m_dx11Context->CSSetUnorderedAccessViews( 2, 1, &(m_triangleData.m_dx11Normal.getUAV()), NULL );
+	m_dx11Context->CSSetUnorderedAccessViews( 3, 1, &(m_triangleData.m_dx11Area.getUAV()), NULL );
+
+	// Execute the kernel
+	m_dx11Context->CSSetShader( updateSoftBodiesKernel.kernel, NULL, 0 );
+
+	int	numBlocks = (numTriangles + (128-1)) / 128;
+	m_dx11Context->Dispatch(numBlocks, 1, 1 );
+
+	{
+		// Tidy up: unbind exactly the slots bound above (SRV 0-1, UAV 0-3, constant buffer 0)
+		ID3D11ShaderResourceView* pViewNULL[2] = { NULL, NULL };
+		m_dx11Context->CSSetShaderResources( 0, 2, pViewNULL );
+
+		ID3D11UnorderedAccessView* pUAViewNULL[4] = { NULL, NULL, NULL, NULL };
+		m_dx11Context->CSSetUnorderedAccessViews( 0, 4, pUAViewNULL, NULL );
+		// (the original cleared SRV slot 4, which is never bound here, and left UAV slots 2 and 3 bound)
+
+		ID3D11Buffer *pBufferNull = NULL;
+		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
+	}
+} // btDX11SIMDAwareSoftBodySolver::executeUpdateSoftBodies
+
+void btDX11SIMDAwareSoftBodySolver::updateSoftBodies()
+{
+	// Recomputes normals and areas on the accelerator in three passes:
+	// reset the per-vertex values, run the update kernel once per triangle batch, then normalize per vertex.
+
+	const int numVertices = m_vertexData.getNumVertices();
+
+	// Ensure data is on accelerator
+	m_vertexData.moveToAccelerator();
+	m_triangleData.moveToAccelerator();
+
+	// Pass 1: reset over all vertices
+	resetNormalsAndAreas( numVertices );
+
+	// Pass 2: go through triangle batches so updates occur correctly
+	// (each entry of m_batchStartLengths gives the first triangle and the triangle count of one batch)
+	for( int batchIndex = 0; batchIndex < m_triangleData.m_batchStartLengths.size(); ++batchIndex )
+	{
+		const int batchStart = m_triangleData.m_batchStartLengths[batchIndex].start;
+		const int batchLength = m_triangleData.m_batchStartLengths[batchIndex].length;
+
+		executeUpdateSoftBodies( batchStart, batchLength );
+	}
+
+	// Pass 3: normalize over all vertices
+	normalizeNormalsAndAreas( numVertices );
+
+
+} // btDX11SIMDAwareSoftBodySolver::updateSoftBodies
+
+// Returns a scaled by dot(v, a); this is the projection of v onto a when a is unit length (ApplyClampedForce passes a normalized axis).
+Vectormath::Aos::Vector3 btDX11SIMDAwareSoftBodySolver::ProjectOnAxis( const Vectormath::Aos::Vector3 &v, const Vectormath::Aos::Vector3 &a )
+{
+	return a*Vectormath::Aos::dot(v, a);
+}
+
+void btDX11SIMDAwareSoftBodySolver::ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce )
+{
+	// Add the force unless force*dt*inverseMass is longer than the velocity; in that case subtract the velocity component along the force direction instead.
+	const float dtInverseMass = solverdt*inverseMass;
+	const bool forceFitsWithinVelocity = !( Vectormath::Aos::lengthSqr(force * dtInverseMass) > Vectormath::Aos::lengthSqr(vertexVelocity) );
+	if( forceFitsWithinVelocity )
+		vertexForce += force;
+	else
+		vertexForce -= ProjectOnAxis( vertexVelocity, normalize( force ) )/dtInverseMass;
+}
+
+/**
+ * Dispatch the apply-forces kernel over every vertex held by the solver.
+ *
+ * Per-vertex and per-cloth inputs are bound as shader resources 0-8; the force
+ * accumulator and velocity arrays are bound as unordered access views 0-1.
+ *
+ * @param solverdt Time step value copied into the kernel's constant buffer.
+ */
+void btDX11SIMDAwareSoftBodySolver::applyForces( float solverdt )
+{		
+	using namespace Vectormath::Aos;
+	
+	// Ensure data is on accelerator
+	m_vertexData.moveToAccelerator();
+	m_dx11PerClothAcceleration.moveToGPU();
+	m_dx11PerClothLiftFactor.moveToGPU();
+	m_dx11PerClothDragFactor.moveToGPU();
+	m_dx11PerClothMediumDensity.moveToGPU();
+	m_dx11PerClothWindVelocity.moveToGPU();
+
+	// The kernel is dispatched once over all vertices; no batching is needed
+	// Copy kernel parameters to GPU
+	ApplyForcesCB constBuffer;
+	
+	constBuffer.numNodes = m_vertexData.getNumVertices();
+	constBuffer.solverdt = solverdt;
+	constBuffer.epsilon = FLT_EPSILON;
+	
+	// Upload through the apply-forces kernel's own constant buffer, which
+	// buildShaders creates with sizeof(ApplyForcesCB); the integrate kernel's
+	// buffer is sized for IntegrateCB and must not be borrowed here.
+	// Todo: factor this out. Number of nodes is static and sdt might be, too, we can update this just once on setup
+	D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
+	m_dx11Context->Map( applyForcesKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
+	memcpy( MappedResource.pData, &constBuffer, sizeof(ApplyForcesCB) );	
+	m_dx11Context->Unmap( applyForcesKernel.constBuffer, 0 );
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &applyForcesKernel.constBuffer );
+
+	// Set resources and dispatch	
+	m_dx11Context->CSSetShaderResources( 0, 1, &(m_vertexData.m_dx11ClothIdentifier.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 1, 1, &(m_vertexData.m_dx11VertexNormal.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 2, 1, &(m_vertexData.m_dx11VertexArea.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 3, 1, &(m_vertexData.m_dx11VertexInverseMass.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 4, 1, &(m_dx11PerClothLiftFactor.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 5, 1, &(m_dx11PerClothDragFactor.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 6, 1, &(m_dx11PerClothWindVelocity.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 7, 1, &(m_dx11PerClothAcceleration.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 8, 1, &(m_dx11PerClothMediumDensity.getSRV()) );
+
+	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexForceAccumulator.getUAV()), NULL );
+	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexVelocity.getUAV()), NULL );
+
+	// Execute the kernel
+	m_dx11Context->CSSetShader( applyForcesKernel.kernel, NULL, 0 );
+
+	// One block per 128 nodes, rounded up (presumably the shader's thread-group size - confirm against the HLSL)
+	int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
+	m_dx11Context->Dispatch(numBlocks, 1, 1 );
+
+	{
+		// Tidy up: unbind everything that was bound above
+		ID3D11ShaderResourceView* pViewNULL = NULL;
+		m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
+		m_dx11Context->CSSetShaderResources( 1, 1, &pViewNULL );
+		m_dx11Context->CSSetShaderResources( 2, 1, &pViewNULL );
+		m_dx11Context->CSSetShaderResources( 3, 1, &pViewNULL );
+		m_dx11Context->CSSetShaderResources( 4, 1, &pViewNULL );
+		m_dx11Context->CSSetShaderResources( 5, 1, &pViewNULL );
+		m_dx11Context->CSSetShaderResources( 6, 1, &pViewNULL );
+		m_dx11Context->CSSetShaderResources( 7, 1, &pViewNULL );
+		m_dx11Context->CSSetShaderResources( 8, 1, &pViewNULL );
+
+		ID3D11UnorderedAccessView* pUAViewNULL = NULL;
+		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
+		m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &pUAViewNULL, NULL );
+
+		ID3D11Buffer *pBufferNull = NULL;
+		m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
+	}	
+
+
+} // btDX11SIMDAwareSoftBodySolver::applyForces
+
+/**
+ * Dispatch the integration kernel over every vertex held by the solver.
+ *
+ * Inverse masses are bound as shader resource 0; position, velocity, previous
+ * position and force accumulator are bound as unordered access views 0-3.
+ *
+ * @param solverdt Time step value copied into the kernel's constant buffer.
+ */
+void btDX11SIMDAwareSoftBodySolver::integrate( float solverdt )
+{
+	// Vertex data has to be on the accelerator before the kernel runs
+	m_vertexData.moveToAccelerator();
+
+	// Kernel parameters; the kernel is dispatched once over all vertices
+	IntegrateCB kernelParams;
+	kernelParams.numNodes = m_vertexData.getNumVertices();
+	kernelParams.solverdt = solverdt;
+
+	// Upload the parameters and bind them at constant buffer slot 0
+	// Todo: factor this out. Number of nodes is static and sdt might be, too, we can update this just once on setup
+	D3D11_MAPPED_SUBRESOURCE mapping = {0};
+	m_dx11Context->Map( integrateKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &mapping );
+	memcpy( mapping.pData, &kernelParams, sizeof(IntegrateCB) );
+	m_dx11Context->Unmap( integrateKernel.constBuffer, 0 );
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &integrateKernel.constBuffer );
+
+	// Read-only input
+	m_dx11Context->CSSetShaderResources( 0, 1, &(m_vertexData.m_dx11VertexInverseMass.getSRV()) );
+
+	// Read-write arrays
+	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexPosition.getUAV()), NULL );
+	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexVelocity.getUAV()), NULL );
+	m_dx11Context->CSSetUnorderedAccessViews( 2, 1, &(m_vertexData.m_dx11VertexPreviousPosition.getUAV()), NULL );
+	m_dx11Context->CSSetUnorderedAccessViews( 3, 1, &(m_vertexData.m_dx11VertexForceAccumulator.getUAV()), NULL );
+
+	// Select the kernel and launch one block per 128 nodes, rounded up
+	m_dx11Context->CSSetShader( integrateKernel.kernel, NULL, 0 );
+
+	const int nodesPerBlock = 128;
+	const int blockCount = (kernelParams.numNodes + (nodesPerBlock-1)) / nodesPerBlock;
+	m_dx11Context->Dispatch( blockCount, 1, 1 );
+
+	// Unbind everything that was bound above
+	ID3D11ShaderResourceView* nullSRV = NULL;
+	m_dx11Context->CSSetShaderResources( 0, 1, &nullSRV );
+
+	ID3D11UnorderedAccessView* nullUAV = NULL;
+	for( unsigned int slot = 0; slot < 4; ++slot )
+		m_dx11Context->CSSetUnorderedAccessViews( slot, 1, &nullUAV, NULL );
+
+	ID3D11Buffer *nullBuffer = NULL;
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &nullBuffer );
+} // btDX11SIMDAwareSoftBodySolver::integrate
+
+// Returns the length of the cross product of the two triangle edges that start
+// at vertex0. Note that no 0.5 factor is applied to the result.
+float btDX11SIMDAwareSoftBodySolver::computeTriangleArea( 
+	const Vectormath::Aos::Point3 &vertex0,
+	const Vectormath::Aos::Point3 &vertex1,
+	const Vectormath::Aos::Point3 &vertex2 )
+{
+	const Vectormath::Aos::Vector3 edge01 = vertex1 - vertex0;
+	const Vectormath::Aos::Vector3 edge02 = vertex2 - vertex0;
+
+	float area = length( cross( edge01, edge02 ) );
+	return area;
+} // btDX11SIMDAwareSoftBodySolver::computeTriangleArea
+
+// CPU-side refresh of the per-link solver constants (rest length, rest length
+// squared and massLSC). Does nothing unless m_updateSolverConstants is set; the
+// flag is cleared once the refresh has been started. timeStep is not used.
+void btDX11SIMDAwareSoftBodySolver::updateConstants( float timeStep )
+{
+	using namespace Vectormath::Aos;
+
+	if( !m_updateSolverConstants )
+		return;
+
+	m_updateSolverConstants = false;
+
+	// This has to be redone if the structure changes (tearing, maybe) or after various other possible changes
+
+	const int linkCount = m_linkData.getNumLinks();
+	for( int link = 0; link < linkCount; ++link )
+	{
+		btSoftBodyLinkData::LinkNodePair &endPoints( m_linkData.getVertexPair(link) );
+
+		// Rest length is the current distance between the link's two vertices
+		m_linkData.getRestLength(link) = length((m_vertexData.getPosition( endPoints.vertex0 ) - m_vertexData.getPosition( endPoints.vertex1 )));
+
+		// massLSC = (sum of the end points' inverse masses) / linear stiffness coefficient
+		const float inverseMassA = m_vertexData.getInverseMass(endPoints.vertex0);
+		const float inverseMassB = m_vertexData.getInverseMass(endPoints.vertex1);
+		const float stiffness = m_linkData.getLinearStiffnessCoefficient(link);
+		m_linkData.getMassLSC(link) = (inverseMassA + inverseMassB)/stiffness;
+
+		// Square the rest length as stored above
+		const float storedRestLength = m_linkData.getRestLength(link);
+		m_linkData.getRestLengthSquared(link) = storedRestLength*storedRestLength;
+	}
+} // btDX11SIMDAwareSoftBodySolver::updateConstants
+
+
+
+/**
+ * Run the position-based link solver and then derive velocities from the
+ * resulting positions.
+ *
+ * For each of m_numberOfPositionIterations iterations, solveLinksForPosition is
+ * called once per entry of m_linkData.m_wavefrontBatchStartLengths. Afterwards
+ * updateVelocitiesFromPositionsWithoutVelocities is called with 1/solverdt.
+ *
+ * @param solverdt Solver time step; its reciprocal is passed to the velocity update.
+ */
+void btDX11SIMDAwareSoftBodySolver::solveConstraints( float solverdt )
+{
+	// Fixed kst / ti values forwarded unchanged to every position solve
+	const float kst = 1.f;
+	const float ti = 0.f;
+
+	m_dx11PerClothDampingFactor.moveToGPU();
+	m_dx11PerClothVelocityCorrectionCoefficient.moveToGPU();
+
+	// Ensure data is on accelerator
+	m_linkData.moveToAccelerator();
+	m_vertexData.moveToAccelerator();
+
+	// Solve drift
+	for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
+	{
+		// Wavefront batches are dispatched one after another within each iteration
+		for( int i = 0; i < m_linkData.m_wavefrontBatchStartLengths.size(); ++i )
+		{
+			const int startWave = m_linkData.m_wavefrontBatchStartLengths[i].start;
+			const int numWaves = m_linkData.m_wavefrontBatchStartLengths[i].length;
+
+			solveLinksForPosition( startWave, numWaves, kst, ti );
+		}
+	} // for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
+
+	updateVelocitiesFromPositionsWithoutVelocities( 1.f/solverdt );
+
+} // btDX11SIMDAwareSoftBodySolver::solveConstraints
+
+
+
+
+//////////////////////////////////////
+// Kernel dispatches
+
+
+/**
+ * Dispatch the update-positions-from-velocities kernel over every vertex.
+ *
+ * Velocities are bound as shader resource 0; previous positions and positions
+ * are bound as unordered access views 0 and 1.
+ *
+ * @param solverdt Copied into the kernel's constant buffer as solverSDT.
+ */
+void btDX11SIMDAwareSoftBodySolver::updatePositionsFromVelocities( float solverdt )
+{
+	// Kernel parameters; the kernel is dispatched once over all vertices
+	UpdatePositionsFromVelocitiesCB kernelParams;
+	kernelParams.numNodes = m_vertexData.getNumVertices();
+	kernelParams.solverSDT = solverdt;
+
+	// Upload the parameters and bind them at constant buffer slot 0
+	// Todo: factor this out. Number of nodes is static and sdt might be, too, we can update this just once on setup
+	D3D11_MAPPED_SUBRESOURCE mapping = {0};
+	m_dx11Context->Map( updatePositionsFromVelocitiesKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &mapping );
+	memcpy( mapping.pData, &kernelParams, sizeof(UpdatePositionsFromVelocitiesCB) );
+	m_dx11Context->Unmap( updatePositionsFromVelocitiesKernel.constBuffer, 0 );
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &updatePositionsFromVelocitiesKernel.constBuffer );
+
+	// Read-only input
+	m_dx11Context->CSSetShaderResources( 0, 1, &(m_vertexData.m_dx11VertexVelocity.getSRV()) );
+
+	// Read-write arrays
+	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexPreviousPosition.getUAV()), NULL );
+	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexPosition.getUAV()), NULL );
+
+	// Select the kernel and launch one block per 128 nodes, rounded up
+	m_dx11Context->CSSetShader( updatePositionsFromVelocitiesKernel.kernel, NULL, 0 );
+
+	const int nodesPerBlock = 128;
+	const int blockCount = (kernelParams.numNodes + (nodesPerBlock-1)) / nodesPerBlock;
+	m_dx11Context->Dispatch( blockCount, 1, 1 );
+
+	// Unbind everything that was bound above
+	ID3D11ShaderResourceView* nullSRV = NULL;
+	m_dx11Context->CSSetShaderResources( 0, 1, &nullSRV );
+
+	ID3D11UnorderedAccessView* nullUAV = NULL;
+	for( unsigned int slot = 0; slot < 2; ++slot )
+		m_dx11Context->CSSetUnorderedAccessViews( slot, 1, &nullUAV, NULL );
+
+	ID3D11Buffer *nullBuffer = NULL;
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &nullBuffer );
+} // btDX11SIMDAwareSoftBodySolver::updatePositionsFromVelocities
+
+
+/**
+ * Dispatch the link position-solve kernel for one batch of wavefronts.
+ *
+ * @param startWave First wavefront of the batch.
+ * @param numWaves  Number of wavefronts in the batch.
+ * @param kst       Copied into the kernel's constant buffer as kst.
+ * @param ti        Copied into the kernel's constant buffer as ti.
+ */
+void btDX11SIMDAwareSoftBodySolver::solveLinksForPosition( int startWave, int numWaves, float kst, float ti )
+{
+	// Both vertex and link data have to be on the accelerator
+	m_vertexData.moveToAccelerator();
+	m_linkData.moveToAccelerator();
+
+	// Kernel parameters: the wavefront range of this batch plus the solver scalars
+	SolvePositionsFromLinksKernelCB kernelParams;
+	kernelParams.startWave = startWave;
+	kernelParams.numWaves = numWaves;
+	kernelParams.kst = kst;
+	kernelParams.ti = ti;
+
+	// Upload the parameters and bind them at constant buffer slot 0
+	D3D11_MAPPED_SUBRESOURCE mapping = {0};
+	m_dx11Context->Map( solvePositionsFromLinksKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &mapping );
+	memcpy( mapping.pData, &kernelParams, sizeof(SolvePositionsFromLinksKernelCB) );
+	m_dx11Context->Unmap( solvePositionsFromLinksKernel.constBuffer, 0 );
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &solvePositionsFromLinksKernel.constBuffer );
+
+	// Read-only inputs
+	m_dx11Context->CSSetShaderResources( 0, 1, &(m_linkData.m_dx11NumBatchesAndVerticesWithinWaves.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 1, 1, &(m_linkData.m_dx11WavefrontVerticesGlobalAddresses.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 2, 1, &(m_vertexData.m_dx11VertexInverseMass.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 3, 1, &(m_linkData.m_dx11LinkVerticesLocalAddresses.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 4, 1, &(m_linkData.m_dx11LinksMassLSC.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 5, 1, &(m_linkData.m_dx11LinksRestLengthSquared.getSRV()) );
+
+	// Read-write array
+	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexPosition.getUAV()), NULL );
+
+	// Select the kernel; each block covers WAVEFRONT_BLOCK_MULTIPLIER waves, rounded up
+	m_dx11Context->CSSetShader( solvePositionsFromLinksKernel.kernel, NULL, 0 );
+
+	const int blockCount = ((kernelParams.numWaves + WAVEFRONT_BLOCK_MULTIPLIER - 1) / WAVEFRONT_BLOCK_MULTIPLIER );
+	m_dx11Context->Dispatch( blockCount, 1, 1 );
+
+	// Unbind everything that was bound above
+	ID3D11ShaderResourceView* nullSRV = NULL;
+	for( unsigned int slot = 0; slot < 6; ++slot )
+		m_dx11Context->CSSetShaderResources( slot, 1, &nullSRV );
+
+	ID3D11UnorderedAccessView* nullUAV = NULL;
+	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &nullUAV, NULL );
+
+	ID3D11Buffer *nullBuffer = NULL;
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &nullBuffer );
+} // btDX11SIMDAwareSoftBodySolver::solveLinksForPosition
+
+
+/**
+ * Dispatch the "with velocities" variant of the velocity-update kernel over
+ * every vertex.
+ *
+ * Positions, previous positions, cloth identifiers and the per-cloth velocity
+ * correction and damping arrays are bound as shader resources 0-4; velocity and
+ * force accumulator are bound as unordered access views 0-1.
+ *
+ * @param isolverdt Copied into the kernel's constant buffer as isolverdt.
+ */
+void btDX11SIMDAwareSoftBodySolver::updateVelocitiesFromPositionsWithVelocities( float isolverdt )
+{
+	// Kernel parameters: node count and the time-step value
+	UpdateVelocitiesFromPositionsWithVelocitiesCB kernelParams;
+	kernelParams.numNodes = m_vertexData.getNumVertices();
+	kernelParams.isolverdt = isolverdt;
+
+	// Upload the parameters and bind them at constant buffer slot 0
+	D3D11_MAPPED_SUBRESOURCE mapping = {0};
+	m_dx11Context->Map( updateVelocitiesFromPositionsWithVelocitiesKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &mapping );
+	memcpy( mapping.pData, &kernelParams, sizeof(UpdateVelocitiesFromPositionsWithVelocitiesCB) );
+	m_dx11Context->Unmap( updateVelocitiesFromPositionsWithVelocitiesKernel.constBuffer, 0 );
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &updateVelocitiesFromPositionsWithVelocitiesKernel.constBuffer );
+
+	// Read-only inputs
+	m_dx11Context->CSSetShaderResources( 0, 1, &(m_vertexData.m_dx11VertexPosition.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 1, 1, &(m_vertexData.m_dx11VertexPreviousPosition.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 2, 1, &(m_vertexData.m_dx11ClothIdentifier.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 3, 1, &(m_dx11PerClothVelocityCorrectionCoefficient.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 4, 1, &(m_dx11PerClothDampingFactor.getSRV()) );
+
+	// Read-write arrays
+	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexVelocity.getUAV()), NULL );
+	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexForceAccumulator.getUAV()), NULL );
+
+	// Select the kernel and launch one block per 128 nodes, rounded up
+	m_dx11Context->CSSetShader( updateVelocitiesFromPositionsWithVelocitiesKernel.kernel, NULL, 0 );
+
+	const int nodesPerBlock = 128;
+	const int blockCount = (kernelParams.numNodes + (nodesPerBlock-1)) / nodesPerBlock;
+	m_dx11Context->Dispatch( blockCount, 1, 1 );
+
+	// Unbind everything that was bound above
+	ID3D11ShaderResourceView* nullSRV = NULL;
+	for( unsigned int slot = 0; slot < 5; ++slot )
+		m_dx11Context->CSSetShaderResources( slot, 1, &nullSRV );
+
+	ID3D11UnorderedAccessView* nullUAV = NULL;
+	for( unsigned int slot = 0; slot < 2; ++slot )
+		m_dx11Context->CSSetUnorderedAccessViews( slot, 1, &nullUAV, NULL );
+
+	ID3D11Buffer *nullBuffer = NULL;
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &nullBuffer );
+
+} // btDX11SIMDAwareSoftBodySolver::updateVelocitiesFromPositionsWithVelocities
+
+/**
+ * Dispatch the "without velocities" variant of the velocity-update kernel over
+ * every vertex.
+ *
+ * Positions, previous positions, cloth identifiers and the per-cloth damping
+ * array are bound as shader resources 0-3; velocity and force accumulator are
+ * bound as unordered access views 0-1.
+ *
+ * @param isolverdt Copied into the kernel's constant buffer as isolverdt.
+ */
+void btDX11SIMDAwareSoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( float isolverdt )
+{
+	// Kernel parameters: node count and the time-step value
+	UpdateVelocitiesFromPositionsWithoutVelocitiesCB kernelParams;
+	kernelParams.numNodes = m_vertexData.getNumVertices();
+	kernelParams.isolverdt = isolverdt;
+
+	// Upload the parameters and bind them at constant buffer slot 0
+	D3D11_MAPPED_SUBRESOURCE mapping = {0};
+	m_dx11Context->Map( updateVelocitiesFromPositionsWithoutVelocitiesKernel.constBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &mapping );
+	memcpy( mapping.pData, &kernelParams, sizeof(UpdateVelocitiesFromPositionsWithoutVelocitiesCB) );
+	m_dx11Context->Unmap( updateVelocitiesFromPositionsWithoutVelocitiesKernel.constBuffer, 0 );
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &updateVelocitiesFromPositionsWithoutVelocitiesKernel.constBuffer );
+
+	// Read-only inputs
+	m_dx11Context->CSSetShaderResources( 0, 1, &(m_vertexData.m_dx11VertexPosition.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 1, 1, &(m_vertexData.m_dx11VertexPreviousPosition.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 2, 1, &(m_vertexData.m_dx11ClothIdentifier.getSRV()) );
+	m_dx11Context->CSSetShaderResources( 3, 1, &(m_dx11PerClothDampingFactor.getSRV()) );
+
+	// Read-write arrays
+	m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(m_vertexData.m_dx11VertexVelocity.getUAV()), NULL );
+	m_dx11Context->CSSetUnorderedAccessViews( 1, 1, &(m_vertexData.m_dx11VertexForceAccumulator.getUAV()), NULL );
+
+	// Select the kernel and launch one block per 128 nodes, rounded up
+	m_dx11Context->CSSetShader( updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel, NULL, 0 );
+
+	const int nodesPerBlock = 128;
+	const int blockCount = (kernelParams.numNodes + (nodesPerBlock-1)) / nodesPerBlock;
+	m_dx11Context->Dispatch( blockCount, 1, 1 );
+
+	// Unbind everything that was bound above
+	ID3D11ShaderResourceView* nullSRV = NULL;
+	for( unsigned int slot = 0; slot < 4; ++slot )
+		m_dx11Context->CSSetShaderResources( slot, 1, &nullSRV );
+
+	ID3D11UnorderedAccessView* nullUAV = NULL;
+	for( unsigned int slot = 0; slot < 2; ++slot )
+		m_dx11Context->CSSetUnorderedAccessViews( slot, 1, &nullUAV, NULL );
+
+	ID3D11Buffer *nullBuffer = NULL;
+	m_dx11Context->CSSetConstantBuffers( 0, 1, &nullBuffer );
+
+} // btDX11SIMDAwareSoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities
+
+// End kernel dispatches
+/////////////////////////////////////
+
+
+
+
+
+
+
+
+
+// Linear search of m_softBodySet for the interface wrapping `softBody`.
+// Returns the first matching interface, or 0 when none matches.
+btDX11SIMDAwareSoftBodySolver::btAcceleratedSoftBodyInterface *btDX11SIMDAwareSoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
+{
+	int candidateIndex = 0;
+	while( candidateIndex < m_softBodySet.size() )
+	{
+		btAcceleratedSoftBodyInterface *candidate = m_softBodySet[candidateIndex];
+		if( candidate->getSoftBody() == softBody )
+		{
+			return candidate;
+		}
+		++candidateIndex;
+	}
+
+	// No interface in the set refers to this soft body
+	return 0;
+}
+
+/**
+ * Copy the positions (and, if requested, normals) of one soft body into a
+ * caller-supplied vertex buffer.
+ *
+ * CPU_BUFFER targets are filled on the host after copying the position and
+ * normal arrays back from the GPU. DX11_BUFFER targets are filled by dispatching
+ * one of the output-to-vertex-array kernels. Other buffer types are ignored.
+ *
+ * @param softBody     Soft body whose vertices are copied. If the solver holds
+ *                     no interface for it, the call asserts in debug builds and
+ *                     otherwise returns without copying anything.
+ * @param vertexBuffer Descriptor of the destination buffer.
+ */
+void btDX11SIMDAwareSoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer )
+{
+	checkInitialized();
+	
+	btAcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );
+
+	// findSoftBodyInterface returns 0 when the soft body is not in m_softBodySet
+	btAssert( currentCloth );
+	if( !currentCloth )
+		return;
+
+	const int firstVertex = currentCloth->getFirstVertex();
+	const int numVertices = currentCloth->getNumVertices();
+
+	if( vertexBuffer->getBufferType() == btVertexBufferDescriptor::CPU_BUFFER )
+	{		
+		// If we're doing a CPU-buffer copy must copy the data back to the host first
+		m_vertexData.m_dx11VertexPosition.copyFromGPU();
+		m_vertexData.m_dx11VertexNormal.copyFromGPU();
+
+		const int lastVertex = firstVertex + numVertices;
+		const btCPUVertexBufferDescriptor *cpuVertexBuffer = static_cast< btCPUVertexBufferDescriptor* >(vertexBuffer);						
+		float *basePointer = cpuVertexBuffer->getBasePointer();						
+
+		if( vertexBuffer->hasVertexPositions() )
+		{
+			// Offset and stride are applied to a float pointer, i.e. counted in floats
+			const int vertexOffset = cpuVertexBuffer->getVertexOffset();
+			const int vertexStride = cpuVertexBuffer->getVertexStride();
+			float *vertexPointer = basePointer + vertexOffset;
+
+			for( int vertexIndex = firstVertex; vertexIndex < lastVertex; ++vertexIndex )
+			{
+				Vectormath::Aos::Point3 position = m_vertexData.getPosition(vertexIndex);
+				*(vertexPointer + 0) = position.getX();
+				*(vertexPointer + 1) = position.getY();
+				*(vertexPointer + 2) = position.getZ();
+				vertexPointer += vertexStride;
+			}
+		}
+		if( vertexBuffer->hasNormals() )
+		{
+			const int normalOffset = cpuVertexBuffer->getNormalOffset();
+			const int normalStride = cpuVertexBuffer->getNormalStride();
+			float *normalPointer = basePointer + normalOffset;
+
+			for( int vertexIndex = firstVertex; vertexIndex < lastVertex; ++vertexIndex )
+			{
+				Vectormath::Aos::Vector3 normal = m_vertexData.getNormal(vertexIndex);
+				*(normalPointer + 0) = normal.getX();
+				*(normalPointer + 1) = normal.getY();
+				*(normalPointer + 2) = normal.getZ();
+				normalPointer += normalStride;
+			}
+		}
+	} else 	if( vertexBuffer->getBufferType() == btVertexBufferDescriptor::DX11_BUFFER )
+	{
+		// Do a DX11 copy shader DX to DX copy
+
+		const btDX11VertexBufferDescriptor *dx11VertexBuffer = static_cast< btDX11VertexBufferDescriptor* >(vertexBuffer);	
+
+		// Copy kernel parameters to GPU; the positions-only kernel is the default
+		OutputToVertexArrayCB constBuffer;
+		ID3D11ComputeShader* outputToVertexArrayShader = outputToVertexArrayWithoutNormalsKernel.kernel;
+		ID3D11Buffer* outputToVertexArrayConstBuffer = outputToVertexArrayWithoutNormalsKernel.constBuffer;
+		
+		constBuffer.startNode = firstVertex;
+		constBuffer.numNodes = numVertices;
+		constBuffer.positionOffset = vertexBuffer->getVertexOffset();
+		constBuffer.positionStride = vertexBuffer->getVertexStride();
+		// Give the normal fields defined values so that the memcpy below never
+		// uploads indeterminate data when the buffer has no normals
+		constBuffer.normalOffset = 0;
+		constBuffer.normalStride = 0;
+		if( vertexBuffer->hasNormals() )
+		{
+			constBuffer.normalOffset = vertexBuffer->getNormalOffset();
+			constBuffer.normalStride = vertexBuffer->getNormalStride();
+			outputToVertexArrayShader = outputToVertexArrayWithNormalsKernel.kernel;
+			outputToVertexArrayConstBuffer = outputToVertexArrayWithNormalsKernel.constBuffer;
+		}	
+		
+		// TODO: factor this out. Number of nodes is static and sdt might be, too, we can update this just once on setup
+		D3D11_MAPPED_SUBRESOURCE MappedResource = {0};
+		m_dx11Context->Map( outputToVertexArrayConstBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &MappedResource );
+		memcpy( MappedResource.pData, &constBuffer, sizeof(OutputToVertexArrayCB) );	
+		m_dx11Context->Unmap( outputToVertexArrayConstBuffer, 0 );
+		m_dx11Context->CSSetConstantBuffers( 0, 1, &outputToVertexArrayConstBuffer );
+
+		// Set resources and dispatch
+		m_dx11Context->CSSetShaderResources( 0, 1, &(m_vertexData.m_dx11VertexPosition.getSRV()) );
+		m_dx11Context->CSSetShaderResources( 1, 1, &(m_vertexData.m_dx11VertexNormal.getSRV()) );
+
+		ID3D11UnorderedAccessView* dx11UAV = dx11VertexBuffer->getDX11UAV();
+		m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &(dx11UAV), NULL );
+
+		// Execute the kernel
+		m_dx11Context->CSSetShader( outputToVertexArrayShader, NULL, 0 );
+
+		// One block per 128 nodes, rounded up
+		int	numBlocks = (constBuffer.numNodes + (128-1)) / 128;
+		m_dx11Context->Dispatch(numBlocks, 1, 1 );
+
+		{
+			// Tidy up: unbind everything that was bound above
+			ID3D11ShaderResourceView* pViewNULL = NULL;
+			m_dx11Context->CSSetShaderResources( 0, 1, &pViewNULL );
+			m_dx11Context->CSSetShaderResources( 1, 1, &pViewNULL );
+
+			ID3D11UnorderedAccessView* pUAViewNULL = NULL;
+			m_dx11Context->CSSetUnorderedAccessViews( 0, 1, &pUAViewNULL, NULL );
+
+			ID3D11Buffer *pBufferNull = NULL;
+			m_dx11Context->CSSetConstantBuffers( 0, 1, &pBufferNull );
+		}	
+	}
+
+} // btDX11SIMDAwareSoftBodySolver::copySoftBodyToVertexBuffer
+
+
+
+
+
+/**
+ * Compile an HLSL compute shader from source text (profile cs_5_0) and,
+ * optionally, create a dynamic constant buffer for it.
+ *
+ * @param shaderString    HLSL source code.
+ * @param shaderName      Used both as the source name and as the entry point.
+ * @param constBufferSize Byte width of the constant buffer to create; no buffer
+ *                        is created when this is not positive.
+ * @param compileMacros   Macro definitions forwarded to the compiler.
+ * @return Descriptor holding the compute shader and constant buffer. On any
+ *         failure both members are 0 and nothing created here is left alive.
+ */
+btDX11SIMDAwareSoftBodySolver::KernelDesc btDX11SIMDAwareSoftBodySolver::compileComputeShaderFromString( const char* shaderString, const char* shaderName, int constBufferSize, D3D10_SHADER_MACRO *compileMacros )
+{
+	const char *cs5String = "cs_5_0";
+
+	// Starts out as the "failed" result; filled in only once everything succeeded
+	btDX11SIMDAwareSoftBodySolver::KernelDesc descriptor;
+	descriptor.kernel = 0;
+	descriptor.constBuffer = 0;
+
+	HRESULT hr = S_OK;
+	ID3DBlob* pErrorBlob = NULL;
+	ID3DBlob* pBlob = NULL;
+	ID3D11ComputeShader*		kernelPointer = 0;
+
+	hr = D3DX11CompileFromMemory( 
+		shaderString,
+		strlen(shaderString),
+		shaderName,
+		compileMacros,
+		NULL,
+		shaderName,
+		cs5String,
+		D3D10_SHADER_ENABLE_STRICTNESS,
+		NULL,
+		NULL,
+		&pBlob,
+		&pErrorBlob,
+		NULL
+		);
+
+	if( FAILED(hr) )
+	{
+		if( pErrorBlob ) {
+			// NOTE(review): the assert condition is a string literal (a non-null
+			// pointer), so this assert can never fire - confirm the intent.
+			btAssert( "Compilation of compute shader failed\n" );
+			char *debugString = (char*)pErrorBlob->GetBufferPointer();
+			OutputDebugStringA( debugString );
+		}
+	
+		SAFE_RELEASE( pErrorBlob );
+		SAFE_RELEASE( pBlob );    
+
+		return descriptor;
+	}    
+
+	// Create the Compute Shader
+	hr = m_dx11Device->CreateComputeShader( pBlob->GetBufferPointer(), pBlob->GetBufferSize(), NULL, &kernelPointer );
+
+	// The blobs are not used past this point, so release them on every path
+	SAFE_RELEASE( pErrorBlob );
+	SAFE_RELEASE( pBlob );
+
+	if( FAILED( hr ) )
+	{
+		return descriptor;
+	}
+
+	ID3D11Buffer* constBuffer = 0;
+	if( constBufferSize > 0 )
+	{
+		// Create the constant buffer
+		D3D11_BUFFER_DESC constant_buffer_desc;
+		ZeroMemory(&constant_buffer_desc, sizeof(constant_buffer_desc));
+		constant_buffer_desc.ByteWidth = constBufferSize;
+		constant_buffer_desc.Usage = D3D11_USAGE_DYNAMIC;
+		constant_buffer_desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
+		constant_buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
+
+		// Capture the result so that the check below tests this call and not
+		// the earlier CreateComputeShader
+		hr = m_dx11Device->CreateBuffer(&constant_buffer_desc, NULL, &constBuffer);
+		if( FAILED( hr ) )
+		{
+			// Do not leak the shader that was created just above
+			SAFE_RELEASE( kernelPointer );
+			return descriptor;
+		}
+	}
+
+	descriptor.kernel = kernelPointer;
+	descriptor.constBuffer = constBuffer;
+	return descriptor;
+} // compileComputeShader
+
+
+/**
+ * Compile all compute kernels used by the solver.
+ *
+ * releaseKernels() is always called first. If m_shadersInitialized is set after
+ * that, nothing is compiled and true is returned. Otherwise every kernel is
+ * compiled; a kernel counts as failed when its constant buffer is null.
+ *
+ * @return true if the shaders were already initialized or every kernel built;
+ *         false if at least one kernel failed. m_shadersInitialized is set only
+ *         on full success.
+ */
+bool btDX11SIMDAwareSoftBodySolver::buildShaders()
+{
+	// Ensure current kernels are released first
+	releaseKernels();
+
+	if( m_shadersInitialized )
+		return true;
+
+	bool allBuilt = true;
+
+	updatePositionsFromVelocitiesKernel = compileComputeShaderFromString( UpdatePositionsFromVelocitiesHLSLString, "UpdatePositionsFromVelocitiesKernel", sizeof(UpdatePositionsFromVelocitiesCB) );
+	if( updatePositionsFromVelocitiesKernel.constBuffer == 0 )
+		allBuilt = false;
+
+	// Wavefront layout values, passed to the link solver as preprocessor macros
+	char verticesPerWaveText[20];
+	char batchesPerWaveText[20];
+	char wavefrontSizeText[20];
+	char blockMultiplierText[20];
+	char blockSizeText[20];
+
+	sprintf(verticesPerWaveText, "%d", m_linkData.getMaxVerticesPerWavefront());
+	sprintf(batchesPerWaveText, "%d", m_linkData.getMaxBatchesPerWavefront());
+	sprintf(wavefrontSizeText, "%d", m_linkData.getWavefrontSize());
+	sprintf(blockMultiplierText, "%d", WAVEFRONT_BLOCK_MULTIPLIER);
+	sprintf(blockSizeText, "%d", WAVEFRONT_BLOCK_MULTIPLIER*m_linkData.getWavefrontSize());
+
+	// The macro list ends with a null name/definition pair
+	D3D10_SHADER_MACRO solvePositionsMacros[6] = {
+		{ "MAX_NUM_VERTICES_PER_WAVE", verticesPerWaveText },
+		{ "MAX_BATCHES_PER_WAVE", batchesPerWaveText },
+		{ "WAVEFRONT_SIZE", wavefrontSizeText },
+		{ "WAVEFRONT_BLOCK_MULTIPLIER", blockMultiplierText },
+		{ "BLOCK_SIZE", blockSizeText },
+		{ 0, 0 }
+	};
+
+	solvePositionsFromLinksKernel = compileComputeShaderFromString( SolvePositionsSIMDBatchedHLSLString, "SolvePositionsFromLinksKernel", sizeof(SolvePositionsFromLinksKernelCB), solvePositionsMacros );
+	if( solvePositionsFromLinksKernel.constBuffer == 0 )
+		allBuilt = false;
+
+	updateVelocitiesFromPositionsWithVelocitiesKernel = compileComputeShaderFromString( UpdateNodesHLSLString, "updateVelocitiesFromPositionsWithVelocitiesKernel", sizeof(UpdateVelocitiesFromPositionsWithVelocitiesCB) );
+	if( updateVelocitiesFromPositionsWithVelocitiesKernel.constBuffer == 0 )
+		allBuilt = false;
+
+	updateVelocitiesFromPositionsWithoutVelocitiesKernel = compileComputeShaderFromString( UpdatePositionsHLSLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel", sizeof(UpdateVelocitiesFromPositionsWithoutVelocitiesCB));
+	if( updateVelocitiesFromPositionsWithoutVelocitiesKernel.constBuffer == 0 )
+		allBuilt = false;
+
+	integrateKernel = compileComputeShaderFromString( IntegrateHLSLString, "IntegrateKernel", sizeof(IntegrateCB) );
+	if( integrateKernel.constBuffer == 0 )
+		allBuilt = false;
+
+	applyForcesKernel = compileComputeShaderFromString( ApplyForcesHLSLString, "ApplyForcesKernel", sizeof(ApplyForcesCB) );
+	if( applyForcesKernel.constBuffer == 0 )
+		allBuilt = false;
+
+	// The next three kernels are compiled from the same HLSL source string
+	// TODO: Rename to UpdateSoftBodies
+	resetNormalsAndAreasKernel = compileComputeShaderFromString( UpdateNormalsHLSLString, "ResetNormalsAndAreasKernel", sizeof(UpdateSoftBodiesCB) );
+	if( resetNormalsAndAreasKernel.constBuffer == 0 )
+		allBuilt = false;
+
+	normalizeNormalsAndAreasKernel = compileComputeShaderFromString( UpdateNormalsHLSLString, "NormalizeNormalsAndAreasKernel", sizeof(UpdateSoftBodiesCB) );
+	if( normalizeNormalsAndAreasKernel.constBuffer == 0 )
+		allBuilt = false;
+
+	updateSoftBodiesKernel = compileComputeShaderFromString( UpdateNormalsHLSLString, "UpdateSoftBodiesKernel", sizeof(UpdateSoftBodiesCB) );
+	if( updateSoftBodiesKernel.constBuffer == 0 )
+		allBuilt = false;
+
+	// Both vertex-array output kernels also share one HLSL source string
+	outputToVertexArrayWithNormalsKernel = compileComputeShaderFromString( OutputToVertexArrayHLSLString, "OutputToVertexArrayWithNormalsKernel", sizeof(OutputToVertexArrayCB) );
+	if( outputToVertexArrayWithNormalsKernel.constBuffer == 0 )
+		allBuilt = false;
+
+	outputToVertexArrayWithoutNormalsKernel = compileComputeShaderFromString( OutputToVertexArrayHLSLString, "OutputToVertexArrayWithoutNormalsKernel", sizeof(OutputToVertexArrayCB) );
+	if( outputToVertexArrayWithoutNormalsKernel.constBuffer == 0 )
+		allBuilt = false;
+
+	if( allBuilt )
+		m_shadersInitialized = true;
+
+	return allBuilt;
+}
+
+
+
+/**
+ * Refresh the per-cloth wind velocities from the soft bodies, then apply forces
+ * and integrate motion for all of them.
+ *
+ * @param timeStep Multiplied by getTimeScale() before being handed to
+ *                 applyForces and integrate.
+ */
+void btDX11SIMDAwareSoftBodySolver::predictMotion( float timeStep )
+{
+	// Mirror each soft body's current wind velocity into the per-cloth array
+	const int clothCount = m_softBodySet.size();
+	m_perClothWindVelocity.resize( clothCount );
+	for( int clothIndex = 0; clothIndex < clothCount; ++clothIndex )
+	{
+		btSoftBody *cloth = m_softBodySet[clothIndex]->getSoftBody();
+		m_perClothWindVelocity[clothIndex] = toVector3( cloth->getWindVelocity() );
+	}
+
+	// Flag the wind velocity buffer as changed on the CPU side
+	m_dx11PerClothWindVelocity.changedOnCPU();
+
+	// Apply the forces we know about to the cloths
+	applyForces(  timeStep * getTimeScale() );
+
+	// Integrate motion for all soft bodies dealt with by the solver
+	integrate( timeStep * getTimeScale() );
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+/**
+ * Distribute wavefronts over batches so that no two wavefronts in the same
+ * batch reference a common vertex.
+ *
+ * Each wavefront, in index order, goes into the first existing batch with which
+ * it shares no vertex; if there is none, a new batch is appended for it.
+ *
+ * @param linksForWavefronts For each wavefront, the indices of its links.
+ * @param linkData           Provides the vertex pair of each link.
+ * @param numVertices        Total vertex count; sizes each batch's vertex map.
+ * @param wavefrontBatches   Output: for each batch, the wavefront indices in it.
+ *                           Expected to be empty on entry, because the local
+ *                           per-batch vertex map starts empty and is indexed by
+ *                           batch number.
+ */
+static void generateBatchesOfWavefronts( btAlignedObjectArray < btAlignedObjectArray <int> > &linksForWavefronts, btSoftBodyLinkData &linkData, int numVertices, btAlignedObjectArray < btAlignedObjectArray <int> > &wavefrontBatches )
+{
+	// A per-batch map of truth values stating whether a given vertex is in that batch
+	// This allows us to significantly optimize the batching
+	btAlignedObjectArray <btAlignedObjectArray<bool> > mapOfVerticesInBatches;
+
+	for( int waveIndex = 0; waveIndex < linksForWavefronts.size(); ++waveIndex )
+	{
+		btAlignedObjectArray <int> &wavefront( linksForWavefronts[waveIndex] );
+
+		// Find the first existing batch that shares no vertex with this wavefront
+		int batch = 0;
+		bool placed = false;
+		for( ; batch < wavefrontBatches.size(); ++batch )
+		{
+			btAlignedObjectArray<bool> &verticesInBatch( mapOfVerticesInBatches[batch] );
+
+			// One shared vertex is enough to rule the batch out, so stop at the first hit
+			bool foundSharedVertex = false;
+			for( int link = 0; link < wavefront.size() && !foundSharedVertex; ++link )
+			{
+				btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
+				foundSharedVertex = verticesInBatch[vertices.vertex0] || verticesInBatch[vertices.vertex1];
+			}
+
+			if( !foundSharedVertex )
+			{
+				placed = true;
+				break;
+			}
+		}
+
+		if( !placed )
+		{
+			// No compatible batch: here batch == wavefrontBatches.size(), so append
+			// a new batch together with an all-false vertex map covering every vertex
+			wavefrontBatches.resize( batch + 1 );
+			mapOfVerticesInBatches.resize( batch + 1 );
+			mapOfVerticesInBatches[batch].resize( numVertices, false );
+		}
+
+		// Record the wavefront in the chosen batch and mark its vertices as used there
+		wavefrontBatches[batch].push_back( waveIndex );
+		for( int link = 0; link < wavefront.size(); ++link )
+		{
+			btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
+			(mapOfVerticesInBatches[batch])[vertices.vertex0] = true;
+			(mapOfVerticesInBatches[batch])[vertices.vertex1] = true;
+		}
+	}
+	mapOfVerticesInBatches.clear();
+}
+
+// Remove the element at indexToRemove, shifting later elements down so the ordering is kept
+template< typename T > static void removeFromVector( btAlignedObjectArray< T > &vectorToUpdate, int indexToRemove )
+{
+	const int oldSize = vectorToUpdate.size();
+	// Close the gap by copying each successor over its predecessor
+	for( int dst = indexToRemove, src = indexToRemove + 1; src < oldSize; ++dst, ++src )
+		vectorToUpdate[dst] = vectorToUpdate[src];
+	// An empty array is left untouched; otherwise drop the now-duplicated last slot
+	if( oldSize > 0 )
+		vectorToUpdate.resize( oldSize - 1 );
+}
+
+/**
+ * Insert element into vectorToUpdate at position index, moving the elements
+ * previously at index and beyond one slot towards the end.
+ */
+template< typename T > static void insertAtIndex( btAlignedObjectArray< T > &vectorToUpdate, int index, T element )
+{
+	const int last = vectorToUpdate.size();	// index of the slot added by the resize below
+	vectorToUpdate.resize( last + 1 );
+	for( int dst = last; dst > index; --dst )	// shift the tail up, back to front
+		vectorToUpdate[dst] = vectorToUpdate[dst - 1];
+	vectorToUpdate[index] = element;
+}
+
+/**
+ * Treat vectorToUpdate as an ordered set: insert element at its sorted
+ * position unless an equal element is already stored there.
+ */
+template< typename T > static void insertUniqueAndOrderedIntoVector( btAlignedObjectArray<T> &vectorToUpdate, T element )
+{
+	const int count = vectorToUpdate.size();
+	int pos = 0;
+	for( ; pos < count; ++pos )	// linear scan for the first entry that is not less than element
+		if( !(vectorToUpdate[pos] < element) )
+			break;
+	const bool alreadyPresent = ( pos < count ) && !( vectorToUpdate[pos] != element );
+	if( !alreadyPresent ) insertAtIndex( vectorToUpdate, pos, element );
+}
+
+// Build, for every vertex, the list of links that use it as an end point.
+//   numLinksPerVertex    incremented once per link end point; this function does not
+//                        reset it, so the caller supplies the starting counts
+//   maxLinks             receives the largest per-vertex count
+//   listOfLinksPerVertex resized to numVertices rows of maxLinks entries; link indices
+//                        are appended to the rows of their end points in link order
+void generateLinksPerVertex( int numVertices, btSoftBodyLinkData &linkData, btAlignedObjectArray< int > &listOfLinksPerVertex, btAlignedObjectArray <int> &numLinksPerVertex, int &maxLinks )
+{
+	const int linkCount = linkData.getNumLinks();
+
+	// Pass 1: count the links touching each vertex
+	for( int l = 0; l < linkCount; ++l )
+	{
+		btSoftBodyLinkData::LinkNodePair ends( linkData.getVertexPair(l) );
+		++numLinksPerVertex[ends.vertex0];
+		++numLinksPerVertex[ends.vertex1];
+	}
+
+	// The largest count decides the row stride of the table
+	int stride = 0;
+	for( int v = 0; v < numVertices; ++v )
+	{
+		stride = btMax( numLinksPerVertex[v], stride );
+	}
+	maxLinks = stride;
+
+	// Next free position within each vertex's row
+	btAlignedObjectArray< int > filled;
+	filled.resize( numVertices, 0 );
+
+	listOfLinksPerVertex.resize( stride * numVertices );
+
+	// Pass 2: append each link to the row of vertex0, then to the row of vertex1
+	for( int l = 0; l < linkCount; ++l )
+	{
+		btSoftBodyLinkData::LinkNodePair ends( linkData.getVertexPair(l) );
+		const int endPoints[2] = { ends.vertex0, ends.vertex1 };
+		for( int e = 0; e < 2; ++e )
+		{
+			// Row v starts at v * stride; slot is the first unused entry in it
+			const int v = endPoints[e];
+			const int slot = filled[v];
+			listOfLinksPerVertex[v * stride + slot] = l;
+			filled[v] = slot + 1;
+		}
+	}
+}
+
+static void computeBatchingIntoWavefronts( 
+	btSoftBodyLinkData &linkData, 
+	int wavefrontSize, 
+	int linksPerWorkItem, 
+	int maxLinksPerWavefront, 
+	btAlignedObjectArray < btAlignedObjectArray <int> > &linksForWavefronts, 
+	btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray <int> > > &batchesWithinWaves, /* wave, batch, links in batch */
+	btAlignedObjectArray< btAlignedObjectArray< int > > &verticesForWavefronts /* wavefront, vertex */
+	)
+{
+	// Partition all links into wavefronts and, inside each wavefront, into batches.
+	//  - A wavefront collects up to maxLinksPerWavefront links. Links are gathered from
+	//    the current vertex, and the vertex at the far end of each newly taken link is
+	//    queued (first in, first out) to become a later current vertex.
+	//  - Within a wavefront a batch holds at most wavefrontSize links, no two of which
+	//    share a vertex.
+	// One entry per wavefront is appended to linksForWavefronts (link indices),
+	// batchesWithinWaves (wave, batch, links in batch) and verticesForWavefronts (the
+	// sorted, duplicate-free vertices the wavefront's links touch).
+	// linksPerWorkItem is accepted but not used by this function.
+	btAlignedObjectArray< bool > processedLink;
+	processedLink.resize( linkData.getNumLinks(), false );
+	btAlignedObjectArray< int > listOfLinksPerVertex;
+	int maxLinksPerVertex = 0;
+
+	// Count num vertices
+	int numVertices = 0;
+	for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )
+	{
+		btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );
+		numVertices = btMax( numVertices, nodes.vertex0 + 1 );
+		numVertices = btMax( numVertices, nodes.vertex1 + 1 );
+	}
+
+	// Need list of links per vertex
+	// Compute valence of each vertex
+	btAlignedObjectArray <int> numLinksPerVertex;
+	// Every count starts at zero because generateLinksPerVertex only increments them
+	numLinksPerVertex.resize( numVertices, 0 );
+
+	generateLinksPerVertex( numVertices, linkData, listOfLinksPerVertex, numLinksPerVertex, maxLinksPerVertex );
+
+
+
+	// At this point we know what links we have for each vertex so we can start batching
+	
+	// We want a vertex to start with, let's go with 0
+	int currentVertex = 0;
+	int linksProcessed = 0;
+
+	btAlignedObjectArray <int> verticesToProcess;
+
+	while( linksProcessed < linkData.getNumLinks() )
+	{
+		// Next wavefront
+		int nextWavefront = linksForWavefronts.size();
+		linksForWavefronts.resize( nextWavefront + 1 );
+		btAlignedObjectArray <int> &linksForWavefront(linksForWavefronts[nextWavefront]);
+		verticesForWavefronts.resize( nextWavefront + 1 );
+		btAlignedObjectArray<int> &vertexSet( verticesForWavefronts[nextWavefront] );
+
+		linksForWavefront.resize(0);
+
+		// Loop to find enough links to fill the wavefront
+		// Stopping if we either run out of links, or fill it
+		while( linksProcessed < linkData.getNumLinks() && linksForWavefront.size() < maxLinksPerWavefront )
+		{
+			// Go through the links for the current vertex
+			for( int link = 0; link < numLinksPerVertex[currentVertex] && linksForWavefront.size() < maxLinksPerWavefront; ++link )
+			{
+				int linkAddress = currentVertex * maxLinksPerVertex + link;
+				int linkIndex = listOfLinksPerVertex[linkAddress];
+				
+				// If we have not already processed this link, add it to the wavefront
+				// Claim it as another processed link
+				// Add the vertex at the far end to the list of vertices to process.
+				if( !processedLink[linkIndex] )
+				{
+					linksForWavefront.push_back( linkIndex );
+					linksProcessed++;
+					processedLink[linkIndex] = true;
+					int v0 = linkData.getVertexPair(linkIndex).vertex0;
+					int v1 = linkData.getVertexPair(linkIndex).vertex1;
+					if( v0 == currentVertex )
+						verticesToProcess.push_back( v1 );
+					else
+						verticesToProcess.push_back( v0 );
+				}
+			}
+			if( verticesToProcess.size() > 0 )
+			{
+				// Get the element on the front of the queue and remove it
+				currentVertex = verticesToProcess[0];
+				removeFromVector( verticesToProcess, 0 );
+			} else {		
+				// If we've not yet processed all the links, find the first unprocessed one
+				// and select one of its vertices as the current vertex
+				if( linksProcessed < linkData.getNumLinks() )
+				{
+					int searchLink = 0;
+					while( processedLink[searchLink] )
+						searchLink++;
+					currentVertex = linkData.getVertexPair(searchLink).vertex0;
+				}	
+			}
+		}
+
+		// We have either finished or filled a wavefront
+		// Collect the vertices its links touch as a sorted set without duplicates
+		for( int link = 0; link < linksForWavefront.size(); ++link )
+		{
+			int v0 = linkData.getVertexPair( linksForWavefront[link] ).vertex0;
+			int v1 = linkData.getVertexPair( linksForWavefront[link] ).vertex1;
+			insertUniqueAndOrderedIntoVector( vertexSet, v0 );
+			insertUniqueAndOrderedIntoVector( vertexSet, v1 );
+		}
+		// Iterate over links mapped to the wave and batch those
+		// We can run a batch on each cycle trivially
+		batchesWithinWaves.resize( batchesWithinWaves.size() + 1 );
+		btAlignedObjectArray < btAlignedObjectArray <int> > &batchesWithinWave( batchesWithinWaves[batchesWithinWaves.size()-1] );
+		
+
+		for( int link = 0; link < linksForWavefront.size(); ++link )
+		{
+			int linkIndex = linksForWavefront[link];
+			btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( linkIndex );
+			
+			// Place the link in the first batch that is not full and shares no vertex with it
+			int batch = 0;
+			bool placed = false;
+			while( batch < batchesWithinWave.size() && !placed )
+			{
+				bool foundSharedVertex = false;
+				if( batchesWithinWave[batch].size() >= wavefrontSize )
+				{
+					// If we have already filled this batch, move on to another
+					foundSharedVertex = true;
+				} else {
+					for( int link2 = 0; link2 < batchesWithinWave[batch].size(); ++link2 )
+					{
+						btSoftBodyLinkData::LinkNodePair vertices2 = linkData.getVertexPair( (batchesWithinWave[batch])[link2] );
+
+						if( vertices.vertex0 == vertices2.vertex0 ||
+							vertices.vertex1 == vertices2.vertex0 ||
+							vertices.vertex0 == vertices2.vertex1 ||
+							vertices.vertex1 == vertices2.vertex1 )
+						{
+							foundSharedVertex = true;
+							break;
+						}
+					}
+				}
+				if( !foundSharedVertex )
+				{
+					batchesWithinWave[batch].push_back( linkIndex );
+					placed = true;
+				} else {
+					++batch;
+				}
+			}
+			// No existing batch could take the link: start a new batch with it
+			if( batch == batchesWithinWave.size() && !placed )
+			{
+				batchesWithinWave.resize( batch + 1 );
+				batchesWithinWave[batch].push_back( linkIndex );
+			}
+		}
+		
+	}
+
+}
+// Groups the links into wavefronts and batches, then rewrites the link arrays in padded, wavefront-by-wavefront order.
+void btSoftBodyLinkDataDX11SIMDAware::generateBatches()
+{
+	btAlignedObjectArray < btAlignedObjectArray <int> > linksForWavefronts;
+	btAlignedObjectArray < btAlignedObjectArray <int> > wavefrontBatches;
+	btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray <int> > > batchesWithinWaves;
+	btAlignedObjectArray< btAlignedObjectArray< int > > verticesForWavefronts; // wavefronts, vertices in wavefront as an ordered set
+
+	// Group the links into wavefronts
+	computeBatchingIntoWavefronts( *this, m_wavefrontSize, m_linksPerWorkItem, m_maxLinksPerWavefront, linksForWavefronts, batchesWithinWaves, verticesForWavefronts );
+
+
+	// Batch the wavefronts
+	generateBatchesOfWavefronts( linksForWavefronts, *this, m_maxVertex, wavefrontBatches );
+
+	m_numWavefronts = linksForWavefronts.size();
+
+	// At this point we have a description of which links we need to process in each wavefront
+
+	// First correctly fill the batch ranges vector
+	int numBatches = wavefrontBatches.size();
+	m_wavefrontBatchStartLengths.resize(0);
+	int prefixSum = 0;
+	for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex )
+	{
+		int wavesInBatch = wavefrontBatches[batchIndex].size();
+		int nextPrefixSum = prefixSum + wavesInBatch;
+		m_wavefrontBatchStartLengths.push_back( BatchPair( prefixSum, nextPrefixSum - prefixSum ) ); // (running start, number of wavefronts in this batch)
+
+		prefixSum += wavesInBatch;
+	}
+	
+	// Also find max number of batches within a wave
+	m_maxBatchesWithinWave = 0;
+	m_maxVerticesWithinWave = 0;
+	m_numBatchesAndVerticesWithinWaves.resize( m_numWavefronts );
+	for( int waveIndex = 0; waveIndex < m_numWavefronts; ++waveIndex )
+	{
+		// See if the number of batches in this wave is greater than the current maximum
+		int batchesInCurrentWave = batchesWithinWaves[waveIndex].size();
+		int verticesInCurrentWave = verticesForWavefronts[waveIndex].size();
+		m_maxBatchesWithinWave = btMax( batchesInCurrentWave, m_maxBatchesWithinWave );
+		m_maxVerticesWithinWave = btMax( verticesInCurrentWave, m_maxVerticesWithinWave );
+	}
+	
+	// Add padding values both for alignment and as dud addresses within LDS to compute junk rather than branch around
+	m_maxVerticesWithinWave = 16*((m_maxVerticesWithinWave/16)+2); // a multiple of 16 that exceeds the real maximum by at least 17
+
+	// Now we know the maximum number of vertices per-wave we can resize the global vertices array
+	m_wavefrontVerticesGlobalAddresses.resize( m_maxVerticesWithinWave * m_numWavefronts );
+
+	// Grab backup copies of all the link data arrays for the sorting process
+	btAlignedObjectArray<btSoftBodyLinkData::LinkNodePair>				m_links_Backup(m_links);
+	btAlignedObjectArray<float>											m_linkStrength_Backup(m_linkStrength);
+	btAlignedObjectArray<float>											m_linksMassLSC_Backup(m_linksMassLSC);
+	btAlignedObjectArray<float>											m_linksRestLengthSquared_Backup(m_linksRestLengthSquared);
+	//btAlignedObjectArray<Vectormath::Aos::Vector3>						m_linksCLength_Backup(m_linksCLength);
+	//btAlignedObjectArray<float>											m_linksLengthRatio_Backup(m_linksLengthRatio);
+	btAlignedObjectArray<float>											m_linksRestLength_Backup(m_linksRestLength);
+	btAlignedObjectArray<float>											m_linksMaterialLinearStiffnessCoefficient_Backup(m_linksMaterialLinearStiffnessCoefficient);
+
+	// Resize to a wavefront sized batch per batch per wave so we get perfectly coherent memory accesses.
+	m_links.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
+	m_linkVerticesLocalAddresses.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
+	m_linkStrength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
+	m_linksMassLSC.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
+	m_linksRestLengthSquared.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
+	m_linksRestLength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
+	m_linksMaterialLinearStiffnessCoefficient.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );	
+		
+	// Then re-order links into wavefront blocks
+
+	// Total number of wavefronts moved. This will decide the ordering of sorted wavefronts.
+	int wavefrontCount = 0;
+
+	// Iterate over batches of wavefronts, then wavefronts in the batch
+	for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex )
+	{
+		btAlignedObjectArray <int> &batch( wavefrontBatches[batchIndex] );
+		int wavefrontsInBatch = batch.size();
+
+		
+		for( int wavefrontIndex = 0; wavefrontIndex < wavefrontsInBatch; ++wavefrontIndex )
+		{	
+
+			int originalWavefrontIndex = batch[wavefrontIndex]; // index into the per-wavefront arrays built above
+			btAlignedObjectArray< int > &wavefrontVertices( verticesForWavefronts[originalWavefrontIndex] );
+			int verticesUsedByWavefront = wavefrontVertices.size();
+
+			// Copy the set of vertices into the correctly structured array for use on the device
+			// Fill the non-vertices with -1s
+			// so we can mask out those reads
+			for( int vertex = 0; vertex < verticesUsedByWavefront; ++vertex )
+			{
+				m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = wavefrontVertices[vertex];
+			}
+			for( int vertex = verticesUsedByWavefront; vertex < m_maxVerticesWithinWave; ++vertex )
+			{
+				m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = -1;
+			}
+
+			// Obtain the set of batches within the current wavefront
+			btAlignedObjectArray < btAlignedObjectArray <int> > &batchesWithinWavefront( batchesWithinWaves[originalWavefrontIndex] );
+			// Set the size of the batches for use in the solver, correctly ordered
+			NumBatchesVerticesPair batchesAndVertices;
+			batchesAndVertices.numBatches = batchesWithinWavefront.size();
+			batchesAndVertices.numVertices = verticesUsedByWavefront;
+			m_numBatchesAndVerticesWithinWaves[wavefrontCount] = batchesAndVertices;
+			
+
+			// Now iterate over batches within the wavefront to structure the links correctly
+			for( int wavefrontBatch = 0; wavefrontBatch < batchesWithinWavefront.size(); ++wavefrontBatch )
+			{
+				btAlignedObjectArray <int> &linksInBatch( batchesWithinWavefront[wavefrontBatch] );
+				int wavefrontBatchSize = linksInBatch.size();
+
+				int batchAddressInTarget = m_maxBatchesWithinWave * m_wavefrontSize * wavefrontCount + m_wavefrontSize * wavefrontBatch; // first slot of this batch in the resized arrays
+
+				for( int linkIndex = 0; linkIndex < wavefrontBatchSize; ++linkIndex )
+				{
+					int originalLinkAddress = linksInBatch[linkIndex];
+					// Reorder simple arrays trivially
+					m_links[batchAddressInTarget + linkIndex] = m_links_Backup[originalLinkAddress];
+					m_linkStrength[batchAddressInTarget + linkIndex] = m_linkStrength_Backup[originalLinkAddress];
+					m_linksMassLSC[batchAddressInTarget + linkIndex] = m_linksMassLSC_Backup[originalLinkAddress];
+					m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = m_linksRestLengthSquared_Backup[originalLinkAddress];
+					m_linksRestLength[batchAddressInTarget + linkIndex] = m_linksRestLength_Backup[originalLinkAddress];
+					m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = m_linksMaterialLinearStiffnessCoefficient_Backup[originalLinkAddress];
+
+					// The local address is more complicated. We need to work out where a given vertex will end up
+					// by searching the set of vertices for this link and using the index as the local address
+					btSoftBodyLinkData::LinkNodePair localPair;
+					btSoftBodyLinkData::LinkNodePair globalPair = m_links[batchAddressInTarget + linkIndex];
+					localPair.vertex0 = wavefrontVertices.findLinearSearch( globalPair.vertex0 );
+					localPair.vertex1 = wavefrontVertices.findLinearSearch( globalPair.vertex1 );
+					m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair;
+				}
+				for( int linkIndex = wavefrontBatchSize; linkIndex < m_wavefrontSize; ++linkIndex )
+				{
+					// Put 0s into these arrays for padding for cleanliness
+					m_links[batchAddressInTarget + linkIndex] = btSoftBodyLinkData::LinkNodePair(0, 0);
+					m_linkStrength[batchAddressInTarget + linkIndex] = 0.f;
+					m_linksMassLSC[batchAddressInTarget + linkIndex] = 0.f;
+					m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = 0.f;
+					m_linksRestLength[batchAddressInTarget + linkIndex] = 0.f;
+					m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = 0.f;
+
+
+					// For local addresses of junk data choose a set of addresses just above the range of valid ones 
+					// and cycling through % 16 so that we don't have bank conflicts between all dud addresses
+					// The valid addresses will do scatter and gather in the valid range, the junk ones should happily work
+					// off the end of that range so we need no control
+					btSoftBodyLinkData::LinkNodePair localPair;
+					localPair.vertex0 = verticesUsedByWavefront + (linkIndex % 16);
+					localPair.vertex1 = verticesUsedByWavefront + (linkIndex % 16);
+					m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair;
+				}
+
+			}
+
+			
+			wavefrontCount++; // wavefronts are numbered in the order they are written out
+		}
+
+	
+	}
+
+} // void btSoftBodyLinkDataDX11SIMDAware::generateBatches()
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h
new file mode 100644
index 000000000..ceac535e2
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h
@@ -0,0 +1,432 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "vectormath/vmInclude.h"
+#include "BulletSoftBody/btSoftBodySolvers.h"
+#include "btSoftBodySolverVertexBuffer_DX11.h"
+#include "btSoftBodySolverLinkData_DX11SIMDAware.h"
+#include "btSoftBodySolverVertexData_DX11.h"
+#include "btSoftBodySolverTriangleData_DX11.h"
+
+
+#ifndef BT_SOFT_BODY_DX11_SOLVER_SIMDAWARE_H
+#define BT_SOFT_BODY_DX11_SOLVER_SIMDAWARE_H
+
+class btDX11SIMDAwareSoftBodySolver : public btSoftBodySolver
+{
+public:
+
+		/**
+	 * SoftBody class to maintain information about a soft body instance
+	 * within a solver.
+	 * This data addresses the main solver arrays.
+	 */
+	class btAcceleratedSoftBodyInterface
+	{
+	protected:
+		/** Current number of vertices that are part of this cloth */
+		int m_numVertices;
+		/** Maximum number of vertices allocated to be part of this cloth */
+		int m_maxVertices;
+		/** Current number of triangles that are part of this cloth */
+		int m_numTriangles;
+		/** Maximum number of triangles allocated to be part of this cloth */
+		int m_maxTriangles;
+		/** Index of first vertex in the world allocated to this cloth */
+		int m_firstVertex;
+		/** Index of first triangle in the world allocated to this cloth */
+		int m_firstTriangle;
+		/** Index of first link in the world allocated to this cloth */
+		int m_firstLink;
+		/** Maximum number of links allocated to this cloth */
+		int m_maxLinks;
+		/** Current number of links allocated to this cloth */
+		int m_numLinks;
+
+		/** The actual soft body this data represents */
+		btSoftBody *m_softBody;
+
+	public:
+		/** Wrap softBody with every count, capacity and first-index set to zero. */
+		btAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
+			m_numVertices( 0 ),
+			m_maxVertices( 0 ),
+			m_numTriangles( 0 ),
+			m_maxTriangles( 0 ),
+			m_firstVertex( 0 ),
+			m_firstTriangle( 0 ),
+			m_firstLink( 0 ),
+			m_maxLinks( 0 ),
+			m_numLinks( 0 ),
+			m_softBody( softBody )
+		{
+		}
+
+		/** Accessors for the current number of vertices of this cloth. */
+		int getNumVertices()
+		{
+			return m_numVertices;
+		}
+		void setNumVertices( int numVertices )
+		{
+			m_numVertices = numVertices;
+		}
+
+		/** Accessors for the maximum number of vertices allocated to this cloth. */
+		int getMaxVertices()
+		{
+			return m_maxVertices;
+		}
+		void setMaxVertices( int maxVertices )
+		{
+			m_maxVertices = maxVertices;
+		}
+
+		/** Accessors for the index of the first vertex allocated to this cloth. */
+		int getFirstVertex()
+		{
+			return m_firstVertex;
+		}
+		void setFirstVertex( int firstVertex )
+		{
+			m_firstVertex = firstVertex;
+		}
+
+		/** Accessors for the current number of triangles of this cloth. */
+		int getNumTriangles()
+		{
+			return m_numTriangles;
+		}
+		void setNumTriangles( int numTriangles )
+		{
+			m_numTriangles = numTriangles;
+		}
+
+		/** Accessors for the maximum number of triangles allocated to this cloth. */
+		int getMaxTriangles()
+		{
+			return m_maxTriangles;
+		}
+		void setMaxTriangles( int maxTriangles )
+		{
+			m_maxTriangles = maxTriangles;
+		}
+
+		/** Accessors for the index of the first triangle allocated to this cloth. */
+		int getFirstTriangle()
+		{
+			return m_firstTriangle;
+		}
+		void setFirstTriangle( int firstTriangle )
+		{
+			m_firstTriangle = firstTriangle;
+		}
+
+		/** Accessors for the current number of links allocated to this cloth. */
+		int getNumLinks()
+		{
+			return m_numLinks;
+		}
+		void setNumLinks( int numLinks )
+		{
+			m_numLinks = numLinks;
+		}
+
+		/** Accessors for the maximum number of links allocated to this cloth. */
+		int getMaxLinks()
+		{
+			return m_maxLinks;
+		}
+		void setMaxLinks( int maxLinks )
+		{
+			m_maxLinks = maxLinks;
+		}
+
+		/** Accessors for the index of the first link allocated to this cloth. */
+		int getFirstLink()
+		{
+			return m_firstLink;
+		}
+		void setFirstLink( int firstLink )
+		{
+			m_firstLink = firstLink;
+		}
+
+		/** The soft body pointer that was handed to the constructor. */
+		btSoftBody* getSoftBody()
+		{
+			return m_softBody;
+		}
+	};
+
+
+	class KernelDesc
+	{
+	public:
+		/** Compute shader and the constant buffer kept alongside it; both start out null. */
+		ID3D11ComputeShader* kernel;
+		ID3D11Buffer* constBuffer;
+
+		KernelDesc() :
+			kernel( 0 ),
+			constBuffer( 0 )
+		{
+		}
+		/**
+		 * Releases nothing.
+		 * TODO: this should probably destroy its kernel but we need to be careful
+		 * in case KernelDescs are copied
+		 */
+		virtual ~KernelDesc()
+		{
+		}
+	}; 
+
+	struct SolvePositionsFromLinksKernelCB // NOTE(review): presumably the constant-buffer layout for solvePositionsFromLinksKernel - confirm against the .cpp/HLSL
+	{		
+		int startWave;
+		int numWaves;
+		float kst; // shares its name with the kst argument of solveLinksForPosition()
+		float ti; // shares its name with the ti argument of solveLinksForPosition()
+	};
+
+	struct IntegrateCB // NOTE(review): presumably the constant-buffer layout for integrateKernel - confirm against the .cpp/HLSL
+	{
+		int numNodes;
+		float solverdt;
+		int padding1; // given no meaning in this header; presumably filler so the struct spans 16 bytes - confirm
+		int padding2; // same role as padding1
+	};
+
+	struct UpdatePositionsFromVelocitiesCB // NOTE(review): presumably the constant-buffer layout for updatePositionsFromVelocitiesKernel - confirm
+	{
+		int numNodes;
+		float solverSDT;
+		int padding1; // given no meaning in this header; presumably filler so the struct spans 16 bytes - confirm
+		int padding2; // same role as padding1
+	};
+
+	struct UpdateVelocitiesFromPositionsWithoutVelocitiesCB // NOTE(review): presumably the constant-buffer layout for the kernel of the same name - confirm
+	{
+		int numNodes;
+		float isolverdt; // shares its name with the argument of updateVelocitiesFromPositionsWithoutVelocities()
+		int padding1; // given no meaning in this header; presumably filler so the struct spans 16 bytes - confirm
+		int padding2; // same role as padding1
+	};
+
+	struct UpdateVelocitiesFromPositionsWithVelocitiesCB // NOTE(review): presumably the constant-buffer layout for the kernel of the same name - confirm
+	{
+		int numNodes;
+		float isolverdt; // shares its name with the argument of updateVelocitiesFromPositionsWithVelocities()
+		int padding1; // given no meaning in this header; presumably filler so the struct spans 16 bytes - confirm
+		int padding2; // same role as padding1
+	};
+
+	struct UpdateSoftBodiesCB // NOTE(review): presumably the constant-buffer layout for updateSoftBodiesKernel - confirm against the .cpp/HLSL
+	{
+		int numNodes;
+		int startFace;
+		int numFaces;
+		float epsilon;
+	};
+
+
+	struct OutputToVertexArrayCB // NOTE(review): presumably shared by the outputToVertexArray* kernels as their constant-buffer layout - confirm
+	{
+		int startNode;
+		int numNodes;
+		int positionOffset;
+		int positionStride;
+		
+		int normalOffset;	
+		int normalStride;
+		int padding1; // given no meaning in this header; presumably filler so the struct spans a multiple of 16 bytes - confirm
+		int padding2; // same role as padding1
+	};
+
+
+	struct ApplyForcesCB // NOTE(review): presumably the constant-buffer layout for applyForcesKernel - confirm against the .cpp/HLSL
+	{
+		unsigned int numNodes; // unsigned here, while the other CB structs declare numNodes as int
+		float solverdt;
+		float epsilon;
+		int padding3; // given no meaning in this header; presumably filler so the struct spans 16 bytes - confirm
+	};
+
+	struct AddVelocityCB // NOTE(review): presumably the constant-buffer layout for addVelocityKernel - confirm against the .cpp/HLSL
+	{
+		int startNode;
+		int lastNode;
+		float velocityX; // the velocity is stored as three separate float components
+		float velocityY;
+		float velocityZ;
+		int padding1; // given no meaning in this header; presumably filler so the struct spans a multiple of 16 bytes - confirm
+		int padding2; // same role as padding1
+		int padding3; // same role as padding1
+	};
+
+
+private:
+	ID3D11Device *		 m_dx11Device;
+	ID3D11DeviceContext* m_dx11Context;
+
+
+	/** Link data for all cloths. Note that this will be sorted batch-wise for efficient computation and m_linkAddresses will maintain the addressing. */
+	btSoftBodyLinkDataDX11SIMDAware m_linkData;
+	btSoftBodyVertexDataDX11 m_vertexData;
+	btSoftBodyTriangleDataDX11 m_triangleData;
+		
+	/** Variable to define whether we need to update solver constants on the next iteration */
+	bool m_updateSolverConstants;
+
+	bool m_shadersInitialized;
+	
+	/** 
+	 * Cloths owned by this solver.
+	 * Only our cloths are in this array.
+	 */
+	btAlignedObjectArray< btAcceleratedSoftBodyInterface * > m_softBodySet;
+
+	/** Acceleration value to be applied to all non-static vertices in the solver. 
+	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
+	 */
+	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothAcceleration;
+	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11PerClothAcceleration;
+
+	/** Wind velocity to be applied normal to all non-static vertices in the solver. 
+	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
+	 */
+	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothWindVelocity;
+	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11PerClothWindVelocity;
+
+	/** Velocity damping factor */
+	btAlignedObjectArray< float >						m_perClothDampingFactor;
+	btDX11Buffer<float>									m_dx11PerClothDampingFactor;
+
+	/** Velocity correction coefficient */
+	btAlignedObjectArray< float >						m_perClothVelocityCorrectionCoefficient;
+	btDX11Buffer<float>									m_dx11PerClothVelocityCorrectionCoefficient;
+
+	/** Lift parameter for wind effect on cloth. */
+	btAlignedObjectArray< float >						m_perClothLiftFactor;
+	btDX11Buffer<float>									m_dx11PerClothLiftFactor;
+	
+	/** Drag parameter for wind effect on cloth. */
+	btAlignedObjectArray< float >						m_perClothDragFactor;
+	btDX11Buffer<float>									m_dx11PerClothDragFactor;
+
+	/** Density of the medium in which each cloth sits */
+	btAlignedObjectArray< float >						m_perClothMediumDensity;
+	btDX11Buffer<float>									m_dx11PerClothMediumDensity;
+
+	KernelDesc		solvePositionsFromLinksKernel;
+	KernelDesc		integrateKernel;
+	KernelDesc		addVelocityKernel;
+	KernelDesc		updatePositionsFromVelocitiesKernel;
+	KernelDesc		updateVelocitiesFromPositionsWithoutVelocitiesKernel;
+	KernelDesc		updateVelocitiesFromPositionsWithVelocitiesKernel;
+	KernelDesc		resetNormalsAndAreasKernel;
+	KernelDesc		normalizeNormalsAndAreasKernel;
+	KernelDesc		updateSoftBodiesKernel;
+	KernelDesc		outputToVertexArrayWithNormalsKernel;
+	KernelDesc		outputToVertexArrayWithoutNormalsKernel;
+
+	KernelDesc		outputToVertexArrayKernel;
+	KernelDesc		applyForcesKernel;
+	KernelDesc		collideSphereKernel;
+	KernelDesc		collideCylinderKernel;
+
+
+	
+	/**
+	 * Integrate motion on the solver.
+	 */
+	virtual void integrate( float solverdt );
+	float computeTriangleArea( 
+		const Vectormath::Aos::Point3 &vertex0,
+		const Vectormath::Aos::Point3 &vertex1,
+		const Vectormath::Aos::Point3 &vertex2 );
+
+
+	/**
+	 * Compile a compute shader kernel from a string and return the appropriate KernelDesc object.
+	 */
+	KernelDesc compileComputeShaderFromString( const char* shaderString, const char* shaderName, int constBufferSize, D3D10_SHADER_MACRO *compileMacros = 0 );
+
+	bool buildShaders();
+
+	void resetNormalsAndAreas( int numVertices );
+
+	void normalizeNormalsAndAreas( int numVertices );
+
+	void executeUpdateSoftBodies( int firstTriangle, int numTriangles );
+
+	Vectormath::Aos::Vector3 ProjectOnAxis( const Vectormath::Aos::Vector3 &v, const Vectormath::Aos::Vector3 &a );
+
+	void ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce );
+
+	virtual void applyForces( float solverdt );
+	
+	void updateConstants( float timeStep );
+
+	btAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
+
+	//////////////////////////////////////
+	// Kernel dispatches
+	void prepareLinks();
+
+	void updatePositionsFromVelocities( float solverdt );
+	void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
+	void solveLinksForVelocity( int startLink, int numLinks, float kst );
+	
+	void updateVelocitiesFromPositionsWithVelocities( float isolverdt );
+	void updateVelocitiesFromPositionsWithoutVelocities( float isolverdt );
+
+	// End kernel dispatches
+	/////////////////////////////////////
+
+	void releaseKernels();
+
+
+public:
+	btDX11SIMDAwareSoftBodySolver(ID3D11Device * dx11Device, ID3D11DeviceContext* dx11Context);
+
+	virtual ~btDX11SIMDAwareSoftBodySolver();
+	
+
+
+	virtual btSoftBodyLinkData &getLinkData();
+
+	virtual btSoftBodyVertexData &getVertexData();
+
+	virtual btSoftBodyTriangleData &getTriangleData();
+
+
+
+	virtual bool checkInitialized();
+
+	virtual void updateSoftBodies( );
+
+	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies );
+
+	virtual void solveConstraints( float solverdt );
+
+	virtual void predictMotion( float solverdt );
+
+	virtual void copySoftBodyToVertexBuffer( const btSoftBody *const softBody, btVertexBufferDescriptor *vertexBuffer );
+};
+
+#endif // #ifndef BT_SOFT_BODY_DX11_SOLVER_SIMDAWARE_H
+
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
new file mode 100644
index 000000000..d2ef78f69
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
@@ -0,0 +1,82 @@
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+ADD_DEFINITIONS(-DUSE_AMD_OPENCL)
+ADD_DEFINITIONS(-DCL_PLATFORM_AMD)
+
+
+IF (INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+	INCLUDE_DIRECTORIES(		$ENV{==ATISTREAMSDKROOT=}/include )
+ELSE()
+	INCLUDE_DIRECTORIES(		$ENV{ATISTREAMSDKROOT}/include	)
+ENDIF()
+
+
+
+SET(BulletSoftBodyOpenCLSolvers_SRCS
+	../btSoftBodySolver_OpenCL.cpp
+)
+
+SET(BulletSoftBodyOpenCLSolvers_HDRS
+	../btSoftBodySolver_OpenCL.h
+	../../CPU/btSoftBodySolverData.h
+	../btSoftBodySolverVertexData_OpenCL.h
+	../btSoftBodySolverTriangleData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverBuffer_OpenCL.h
+)
+
+# OpenCL and HLSL Shaders.
+# Build rules generated to stringify these into headers
+# which are needed by some of the sources
+SET(BulletSoftBodyOpenCLSolvers_Shaders
+#	OutputToVertexArray
+	UpdateNormals
+	Integrate
+	UpdatePositions
+	UpdateNodes
+	SolvePositions
+	UpdatePositionsFromVelocities
+	ApplyForces
+	PrepareLinks
+	VSolveLinks
+)
+
+foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC/${f}.cl")
+endforeach(f) 
+
+
+
+ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_AMD
+	${BulletSoftBodyOpenCLSolvers_SRCS} 
+	${BulletSoftBodyOpenCLSolvers_HDRS} 
+	${BulletSoftBodyOpenCLSolvers_OpenCLC}
+)
+
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES SOVERSION ${BULLET_VERSION})
+IF (BUILD_SHARED_LIBS)	# the first argument below is the target being linked: the library built in this directory
+	TARGET_LINK_LIBRARIES(BulletSoftBodySolvers_OpenCL_AMD BulletSoftBody BulletDynamics)
+ENDIF (BUILD_SHARED_LIBS)
+
+
+IF (INSTALL_LIBS)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_AMD DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_AMD DESTINATION lib${LIB_SUFFIX})
+				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Apple/CMakeLists.txt b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Apple/CMakeLists.txt
new file mode 100644
index 000000000..65bbea43a
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Apple/CMakeLists.txt
@@ -0,0 +1,73 @@
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+
+
+
+SET(BulletSoftBodyOpenCLSolvers_SRCS
+	../btSoftBodySolver_OpenCL.cpp
+)
+
+SET(BulletSoftBodyOpenCLSolvers_HDRS
+	../btSoftBodySolver_OpenCL.h
+	../../CPU/btSoftBodySolverData.h
+	../btSoftBodySolverVertexData_OpenCL.h
+	../btSoftBodySolverTriangleData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverBuffer_OpenCL.h
+)
+
+# OpenCL and HLSL Shaders.
+# Build rules generated to stringify these into headers
+# which are needed by some of the sources
+SET(BulletSoftBodyOpenCLSolvers_Shaders
+#	OutputToVertexArray
+	UpdateNormals
+	Integrate
+	UpdatePositions
+	UpdateNodes
+	SolvePositions
+	UpdatePositionsFromVelocities
+	ApplyForces
+	PrepareLinks
+	VSolveLinks
+)
+
+foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${f}.cl")
+endforeach(f) 
+
+
+
+ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_Apple
+	${BulletSoftBodyOpenCLSolvers_SRCS} 
+	${BulletSoftBodyOpenCLSolvers_HDRS} 
+	${BulletSoftBodyOpenCLSolvers_OpenCLC}
+)
+
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES SOVERSION ${BULLET_VERSION})
+IF (BUILD_SHARED_LIBS)	# the first argument below is the target being linked: the library built in this directory
+	TARGET_LINK_LIBRARIES(BulletSoftBodySolvers_OpenCL_Apple BulletSoftBody BulletDynamics)
+ENDIF (BUILD_SHARED_LIBS)
+
+
+IF (INSTALL_LIBS)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Apple  DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Apple  DESTINATION lib${LIB_SUFFIX})
+				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/CMakeLists.txt b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/CMakeLists.txt
index 0c63b945a..36b173cf8 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/CMakeLists.txt
@@ -1,71 +1,16 @@
 
-INCLUDE_DIRECTORIES(
-${BULLET_PHYSICS_SOURCE_DIR}/src
-)
+IF(BUILD_MINICL_OPENCL_DEMOS)
+	SUBDIRS( MiniCL  )
+ENDIF()
 
+IF(BUILD_AMD_OPENCL_DEMOS)
+	SUBDIRS(AMD)
+ENDIF()
 
-SET(OPENCL_DIR $ENV{ATISTREAMSDKROOT})
-SET(OPENCL_INCLUDE_PATH "${ATISTREAMSDKROOT}/include" CACHE DOCSTRING "OpenCL SDK include path")
+IF(BUILD_NVIDIA_OPENCL_DEMOS)
+	SUBDIRS(NVidia)
+ENDIF()
 
-INCLUDE_DIRECTORIES(${OPENCL_INCLUDE_PATH} "../cpu/")
-
-SET(BulletSoftBodyOpenCLSolvers_SRCS
-	btSoftBodySolver_OpenCL.cpp
-)
-
-SET(BulletSoftBodyOpenCLSolvers_HDRS
-	btSoftBodySolver_OpenCL.h
-	../cpu/btSoftBodySolverData.h
-	btSoftBodySolverVertexData_OpenCL.h
-	btSoftBodySolverTriangleData_OpenCL.h
-	btSoftBodySolverLinkData_OpenCL.h
-	btSoftBodySolverBuffer_OpenCL.h
-)
-
-# OpenCL and HLSL Shaders.
-# Build rules generated to stringify these into headers
-# which are needed by some of the sources
-SET(BulletSoftBodyOpenCLSolvers_Shaders
-#	OutputToVertexArray
-	UpdateNormals
-	Integrate
-	UpdatePositions
-	UpdateNodes
-	SolvePositions
-	UpdatePositionsFromVelocities
-	ApplyForces
-	PrepareLinks
-	VSolveLinks
-)
-
-foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
-    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "OpenCLC/${f}.cl")
-endforeach(f) 
-
-
-
-ADD_LIBRARY(BulletSoftBodySolvers_OpenCL  ${BulletSoftBodyOpenCLSolvers_SRCS} ${BulletSoftBodyOpenCLSolvers_HDRS} ${BulletSoftBodyOpenCLSolvers_OpenCLC})
-SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL PROPERTIES VERSION ${BULLET_VERSION})
-SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL PROPERTIES SOVERSION ${BULLET_VERSION})
-IF (BUILD_SHARED_LIBS)
-	TARGET_LINK_LIBRARIES(BulletSoftBody BulletDynamics)
-ENDIF (BUILD_SHARED_LIBS)
-
-
-IF (INSTALL_LIBS)
-	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
-		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
-			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
-				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL DESTINATION .)
-			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
-			INSTALL(TARGETS BulletSoftBodySolvers_OpenCL DESTINATION lib${LIB_SUFFIX})
-				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
-			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
-		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
-
-		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
-			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL PROPERTIES FRAMEWORK true)
-			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
-		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
-	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
-ENDIF (INSTALL_LIBS)
+IF(APPLE)
+	SUBDIRS(Apple)
+ENDIF()
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/CMakeLists.txt b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/CMakeLists.txt
new file mode 100644
index 000000000..e9f86c2c9
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/CMakeLists.txt
@@ -0,0 +1,75 @@
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+ADD_DEFINITIONS(-DUSE_MINICL)
+
+
+
+
+SET(BulletSoftBodyOpenCLSolvers_SRCS
+	../btSoftBodySolver_OpenCL.cpp
+)
+
+SET(BulletSoftBodyOpenCLSolvers_HDRS
+	../btSoftBodySolver_OpenCL.h
+	../../CPU/btSoftBodySolverData.h
+	../btSoftBodySolverVertexData_OpenCL.h
+	../btSoftBodySolverTriangleData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverBuffer_OpenCL.h
+)
+
+# OpenCL and HLSL Shaders.
+# Build rules generated to stringify these into headers
+# which are needed by some of the sources
+SET(BulletSoftBodyOpenCLSolvers_Shaders
+#	OutputToVertexArray
+	UpdateNormals
+	Integrate
+	UpdatePositions
+	UpdateNodes
+	SolvePositions
+	UpdatePositionsFromVelocities
+	ApplyForces
+	PrepareLinks
+	VSolveLinks
+)
+
+foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${f}.cl")
+endforeach(f) 
+
+
+
+ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_Mini
+	${BulletSoftBodyOpenCLSolvers_SRCS} 
+	${BulletSoftBodyOpenCLSolvers_HDRS} 
+	${BulletSoftBodyOpenCLSolvers_OpenCLC}
+)
+
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES SOVERSION ${BULLET_VERSION})
+IF (BUILD_SHARED_LIBS)	# the first argument below is the target being linked: the library built in this directory
+	TARGET_LINK_LIBRARIES(BulletSoftBodySolvers_OpenCL_Mini BulletSoftBody BulletDynamics)
+ENDIF (BUILD_SHARED_LIBS)
+
+
+IF (INSTALL_LIBS)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Mini DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Mini DESTINATION lib${LIB_SUFFIX})
+				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
new file mode 100644
index 000000000..79b0ac234
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
@@ -0,0 +1,40 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <MiniCL/cl_MiniCL_Defs.h>
+// MSTRINGIFY expands to its argument unchanged, so the kernel sources included below are compiled as part of this file.
+#define MSTRINGIFY(A) A
+#include "../OpenCLC10/ApplyForces.cl"
+#include "../OpenCLC10/Integrate.cl"
+#include "../OpenCLC10/PrepareLinks.cl"
+#include "../OpenCLC10/SolvePositions.cl"
+#include "../OpenCLC10/UpdateNodes.cl"
+#include "../OpenCLC10/UpdateNormals.cl"
+#include "../OpenCLC10/UpdatePositions.cl"
+#include "../OpenCLC10/UpdatePositionsFromVelocities.cl"
+//#include "../OpenCLC10/VSolveLinks.cl"
+// One MINICL_REGISTER per kernel from the files included above (presumably makes it launchable by name - confirm in cl_MiniCL_Defs.h).
+MINICL_REGISTER(PrepareLinksKernel)
+MINICL_REGISTER(UpdatePositionsFromVelocitiesKernel)
+MINICL_REGISTER(SolvePositionsFromLinksKernel)
+MINICL_REGISTER(updateVelocitiesFromPositionsWithVelocitiesKernel)
+MINICL_REGISTER(updateVelocitiesFromPositionsWithoutVelocitiesKernel)
+MINICL_REGISTER(IntegrateKernel)
+MINICL_REGISTER(ApplyForcesKernel)
+MINICL_REGISTER(ResetNormalsAndAreasKernel)
+MINICL_REGISTER(NormalizeNormalsAndAreasKernel)
+MINICL_REGISTER(UpdateSoftBodiesKernel)
+
+
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
new file mode 100644
index 000000000..7608492b7
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
@@ -0,0 +1,79 @@
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+
+
+IF(INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+	INCLUDE_DIRECTORIES( $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/inc	)
+ELSE()
+	INCLUDE_DIRECTORIES( $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/inc	)
+ENDIF()
+
+
+SET(BulletSoftBodyOpenCLSolvers_SRCS
+	../btSoftBodySolver_OpenCL.cpp
+)
+
+SET(BulletSoftBodyOpenCLSolvers_HDRS
+	../btSoftBodySolver_OpenCL.h
+	../../CPU/btSoftBodySolverData.h
+	../btSoftBodySolverVertexData_OpenCL.h
+	../btSoftBodySolverTriangleData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverBuffer_OpenCL.h
+)
+
+# OpenCL and HLSL Shaders.
+# Build rules generated to stringify these into headers
+# which are needed by some of the sources
+SET(BulletSoftBodyOpenCLSolvers_Shaders
+#	OutputToVertexArray
+	UpdateNormals
+	Integrate
+	UpdatePositions
+	UpdateNodes
+	SolvePositions
+	UpdatePositionsFromVelocities
+	ApplyForces
+	PrepareLinks
+	VSolveLinks
+)
+
+foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
+    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC/${f}.cl")
+endforeach(f) 
+
+
+
+ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_NVidia
+	${BulletSoftBodyOpenCLSolvers_SRCS} 
+	${BulletSoftBodyOpenCLSolvers_HDRS} 
+	${BulletSoftBodyOpenCLSolvers_OpenCLC}
+)
+
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES SOVERSION ${BULLET_VERSION})
+IF (BUILD_SHARED_LIBS)	# the first argument below is the target being linked: the library built in this directory
+	TARGET_LINK_LIBRARIES(BulletSoftBodySolvers_OpenCL_NVidia BulletSoftBody BulletDynamics)
+ENDIF (BUILD_SHARED_LIBS)
+
+
+IF (INSTALL_LIBS)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_NVidia DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_NVidia DESTINATION lib${LIB_SUFFIX})
+				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ApplyForces.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ApplyForces.cl
new file mode 100644
index 000000000..555d07a1d
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ApplyForces.cl
@@ -0,0 +1,91 @@
+MSTRINGIFY(
+
+// Dot product over the x, y and z components only; the w components are ignored.
+float adot3(float4 a, float4 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+// Returns a scaled by adot3(v, a), i.e. the projection of v onto a when a has unit xyz length.
+float4 projectOnAxis( float4 v, float4 a )
+{
+	return (a*adot3(v, a));
+}
+// One work-item per node: adds clothAcceleration*solverdt to the node velocity, then accumulates a clamped lift/drag force.
+__kernel void 
+ApplyForcesKernel(
+	const uint numNodes,
+	const float solverdt,
+	const float epsilon,	// aerodynamic force is only computed when the squared relative speed exceeds this
+	__global int * g_vertexClothIdentifier,	// per node: index used to read the g_cloth* arrays
+	__global float4 * g_vertexNormal,
+	__global float * g_vertexArea,
+	__global float * g_vertexInverseMass,
+	__global float * g_clothLiftFactor,
+	__global float * g_clothDragFactor,
+	__global float4 * g_clothWindVelocity,
+	__global float4 * g_clothAcceleration,
+	__global float * g_clothMediumDensity,
+	__global float4 * g_vertexForceAccumulator,
+	__global float4 * g_vertexVelocity GUID_ARG)
+{
+	unsigned int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{		
+		int clothId  = g_vertexClothIdentifier[nodeID];
+		float nodeIM = g_vertexInverseMass[nodeID];
+		// Only nodes with a positive inverse mass are modified.
+		if( nodeIM > 0.0f )
+		{
+			float4 nodeV  = g_vertexVelocity[nodeID];
+			float4 normal = g_vertexNormal[nodeID];
+			float area    = g_vertexArea[nodeID];
+			float4 nodeF  = g_vertexForceAccumulator[nodeID];
+			
+			// Read per-cloth values
+			float4 clothAcceleration = g_clothAcceleration[clothId];
+			float4 clothWindVelocity = g_clothWindVelocity[clothId];
+			float liftFactor = g_clothLiftFactor[clothId];
+			float dragFactor = g_clothDragFactor[clothId];
+			float mediumDensity = g_clothMediumDensity[clothId];
+		
+			// Apply the acceleration to the cloth rather than do this via a force
+			nodeV += (clothAcceleration*solverdt);
+			// The updated velocity is written back even if no aerodynamic force is added below.
+			g_vertexVelocity[nodeID] = nodeV;
+			// NOTE(review): dot() and normalize() below use all four components, unlike adot3; equivalent only if w is 0 - confirm.
+			float4 relativeWindVelocity = nodeV - clothWindVelocity;
+			float relativeSpeedSquared = dot(relativeWindVelocity, relativeWindVelocity);
+			
+			if( relativeSpeedSquared > epsilon )
+			{
+				// Correct direction of normal relative to wind direction and get dot product
+				normal = normal * (dot(normal, relativeWindVelocity) < 0 ? -1.f : 1.f);
+				float dvNormal = dot(normal, relativeWindVelocity);
+				if( dvNormal > 0 )
+				{
+					float4 force = (float4)(0.f, 0.f, 0.f, 0.f);
+					float c0 = area * dvNormal * relativeSpeedSquared / 2.f;
+					float c1 = c0 * mediumDensity;
+					force += normal * (-c1 * liftFactor);
+					force += normalize(relativeWindVelocity)*(-c1 * dragFactor);
+					// Clamp: if force*dt*inverseMass is larger in magnitude than the velocity, subtract the projected velocity instead of adding the force.
+					float dtim = solverdt * nodeIM;
+					float4 forceDTIM = force * dtim;
+					
+					float4 nodeFPlusForce = nodeF + force;
+					
+					// m_nodesf[i] -= ProjectOnAxis(m_nodesv[i], force.normalized())/dtim;	
+					float4 nodeFMinus = nodeF - (projectOnAxis(nodeV, normalize(force))/dtim);
+					
+					nodeF = nodeFPlusForce;
+					if( dot(forceDTIM, forceDTIM) > dot(nodeV, nodeV) )
+						nodeF = nodeFMinus;
+									
+					g_vertexForceAccumulator[nodeID] = nodeF;	
+				}
+			}
+		}
+	}
+}
+
+);
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/Integrate.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/Integrate.cl
new file mode 100644
index 000000000..fb65330d9
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/Integrate.cl
@@ -0,0 +1,35 @@
+MSTRINGIFY(
+
+// IntegrateKernel: one work-item per node.
+// Saves the current position as the previous position, advances the velocity by
+// force*inverseMass*solverdt and the position by the updated velocity*solverdt,
+// then zeroes the force accumulator of the node.
+__kernel void
+IntegrateKernel( 
+	const int numNodes,
+	const float solverdt,
+	__global float * g_vertexInverseMasses,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_vertexVelocity,
+	__global float4 * g_vertexPreviousPositions,
+	__global float4 * g_vertexForceAccumulator GUID_ARG)
+{
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{	
+		float4 position   = g_vertexPositions[nodeID];
+		float4 velocity   = g_vertexVelocity[nodeID];
+		float4 force      = g_vertexForceAccumulator[nodeID];
+		float inverseMass = g_vertexInverseMasses[nodeID];
+		
+		g_vertexPreviousPositions[nodeID] = position;
+		velocity += force * inverseMass * solverdt;
+		position += velocity * solverdt;
+		
+		g_vertexForceAccumulator[nodeID] = (float4)(0.f, 0.f, 0.f, 0.0f);
+		g_vertexPositions[nodeID]        = position;
+		g_vertexVelocity[nodeID]         = velocity;	
+	}
+}
+
+);
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/PrepareLinks.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/PrepareLinks.cl
new file mode 100644
index 000000000..ba3277667
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/PrepareLinks.cl
@@ -0,0 +1,41 @@
+MSTRINGIFY(
+// Dot product over the x, y and z components only; the w components are ignored.
+float dot3(float4 a, float4 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+// One work-item per link: stores the vector between the previous positions of the two link nodes
+// in g_linksCurrentLength, and 1/(squared xyz length of that vector * massLSC) in g_linksLengthRatio.
+__kernel void 
+PrepareLinksKernel( 
+	const int numLinks,
+	__global int2 * g_linksVertexIndices,
+	__global float * g_linksMassLSC,
+	__global float4 * g_nodesPreviousPosition,
+	__global float * g_linksLengthRatio,
+	__global float4 * g_linksCurrentLength GUID_ARG)
+{
+	int linkID = get_global_id(0);
+	if( linkID < numLinks )
+	{	
+		// x and y of the index pair are the two node indices of this link.
+		int2 nodeIndices = g_linksVertexIndices[linkID];
+		int node0 = nodeIndices.x;
+		int node1 = nodeIndices.y;
+		
+		float4 nodePreviousPosition0 = g_nodesPreviousPosition[node0];
+		float4 nodePreviousPosition1 = g_nodesPreviousPosition[node1];
+
+		float massLSC = g_linksMassLSC[linkID];
+		
+		float4 linkCurrentLength = nodePreviousPosition1 - nodePreviousPosition0;
+		// NOTE(review): no guard here; a zero massLSC or a zero-length link makes the reciprocal below non-finite.
+		float linkLengthRatio = dot3(linkCurrentLength, linkCurrentLength)*massLSC;
+		linkLengthRatio = 1.0f/linkLengthRatio;
+		
+		g_linksCurrentLength[linkID] = linkCurrentLength;
+		g_linksLengthRatio[linkID]   = linkLengthRatio;		
+	}
+}
+
+);
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositions.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositions.cl
new file mode 100644
index 000000000..fe7aec66e
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositions.cl
@@ -0,0 +1,57 @@
+
+
+
+MSTRINGIFY(
+
+// Dot product over the x, y and z components only; the w components are ignored.
+float mydot3(float4 a, float4 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+// Work-item i handles link startLink+i for i < numLinks: moves both end nodes along the link
+// vector so the squared length approaches restLengthSquared. The ti argument is not used in the body.
+__kernel void 
+SolvePositionsFromLinksKernel( 
+	const int startLink,
+	const int numLinks,
+	const float kst,
+	const float ti,
+	__global int2 * g_linksVertexIndices,
+	__global float * g_linksMassLSC,
+	__global float * g_linksRestLengthSquared,
+	__global float * g_verticesInverseMass,
+	__global float4 * g_vertexPositions GUID_ARG)
+	
+{
+	int linkID = get_global_id(0) + startLink;
+	if( get_global_id(0) < numLinks )
+	{	
+		float massLSC = g_linksMassLSC[linkID];
+		float restLengthSquared = g_linksRestLengthSquared[linkID];
+		// Links with a non-positive massLSC are left untouched, which also avoids dividing by zero below.
+		if( massLSC > 0.0f )
+		{		
+			int2 nodeIndices = g_linksVertexIndices[linkID];
+			int node0 = nodeIndices.x;
+			int node1 = nodeIndices.y;
+			// NOTE(review): unsynchronized read-modify-write of node positions; presumably links in one dispatch share no node - confirm on the host side.
+			float4 position0 = g_vertexPositions[node0];
+			float4 position1 = g_vertexPositions[node1];
+
+			float inverseMass0 = g_verticesInverseMass[node0];
+			float inverseMass1 = g_verticesInverseMass[node1]; 
+			// len holds the squared xyz length of del, not the length itself.
+			float4 del = position1 - position0;
+			float len  = mydot3(del, del);
+			float k    = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
+			position0 = position0 - del*(k*inverseMass0);
+			position1 = position1 + del*(k*inverseMass1);
+
+			g_vertexPositions[node0] = position0;
+			g_vertexPositions[node1] = position1;
+
+		}
+	}
+}
+
+);
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateConstants.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateConstants.cl
new file mode 100644
index 000000000..488a58479
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateConstants.cl
@@ -0,0 +1,44 @@
+MSTRINGIFY(
+
+/*#define float3 float4
+
+float dot3(float3 a, float3 b)
+{
+   return a.x*b.x + a.y*b.y + a.z*b.z;
+}*/
+// One work-item per link: recomputes rest length, rest length squared and (inverseMass0+inverseMass1)/stiffness from the current positions.
+__kernel void 
+UpdateConstantsKernel( 
+	const int numLinks,
+	__global int2 * g_linksVertexIndices,
+	__global float4 * g_vertexPositions,
+	__global float * g_vertexInverseMasses,
+	__global float * g_linksMaterialLSC,
+	__global float * g_linksMassLSC,
+	__global float * g_linksRestLengthSquared,
+	__global float * g_linksRestLengths GUID_ARG)
+{
+	int linkID = get_global_id(0);
+	if( linkID < numLinks )
+	{	
+		int2 nodeIndices = g_linksVertexIndices[linkID];
+		int node0 = nodeIndices.x;
+		int node1 = nodeIndices.y;
+		float linearStiffnessCoefficient = g_linksMaterialLSC[ linkID ];
+		// float4 is used throughout because 3-component vector types are not part of OpenCL 1.0.
+		float4 position0   = g_vertexPositions[node0];
+		float4 position1   = g_vertexPositions[node1];
+		float inverseMass0 = g_vertexInverseMasses[node0];
+		float inverseMass1 = g_vertexInverseMasses[node1];
+		// Squared length over x, y and z only, so the w components never contribute.
+		float4 difference = position0 - position1;
+		float length2 = difference.x*difference.x + difference.y*difference.y + difference.z*difference.z;
+		float length = sqrt(length2);
+	
+		g_linksRestLengths[linkID] = length;
+		g_linksMassLSC[linkID] = (inverseMass0 + inverseMass1)/linearStiffnessCoefficient;
+		g_linksRestLengthSquared[linkID] = length*length;		
+	}
+}
+
+);
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNodes.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNodes.cl
new file mode 100644
index 000000000..9ad227b45
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNodes.cl
@@ -0,0 +1,39 @@
+MSTRINGIFY(
+// One work-item per node: adds (position - previousPosition)*velocityCorrectionCoefficient*isolverdt to the
+// stored velocity, scales the result by (1 - dampingFactor) and zeroes the force entry of the node.
+__kernel void 
+updateVelocitiesFromPositionsWithVelocitiesKernel( 
+	int numNodes,
+	float isolverdt,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_vertexPreviousPositions,
+	__global int * g_vertexClothIndices,
+	__global float *g_clothVelocityCorrectionCoefficients,
+	__global float * g_clothDampingFactor,
+	__global float4 * g_vertexVelocities,
+	__global float4 * g_vertexForces GUID_ARG)
+{
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{	
+		float4 position = g_vertexPositions[nodeID];
+		float4 previousPosition = g_vertexPreviousPositions[nodeID];
+		float4 velocity = g_vertexVelocities[nodeID];
+		int clothIndex = g_vertexClothIndices[nodeID];
+		float velocityCorrectionCoefficient = g_clothVelocityCorrectionCoefficients[clothIndex];
+		float dampingFactor = g_clothDampingFactor[clothIndex];
+		float velocityCoefficient = (1.f - dampingFactor);
+		
+		float4 difference = position - previousPosition;
+		// The positional change is added on top of the existing velocity rather than replacing it.
+		velocity += difference*velocityCorrectionCoefficient*isolverdt;
+		
+		// Damp the velocity
+		velocity *= velocityCoefficient;
+		
+		g_vertexVelocities[nodeID] = velocity;
+		g_vertexForces[nodeID] = (float4)(0.f, 0.f, 0.f, 0.f);								
+	}
+}
+
+);
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNormals.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNormals.cl
new file mode 100644
index 000000000..7bb233413
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNormals.cl
@@ -0,0 +1,102 @@
+MSTRINGIFY(
+// Length of a with its w component forced to zero first, so only x, y and z contribute.
+float length3(float4 a)
+{
+	a.w = 0;
+	return length(a);
+}
+// Normalizes a after forcing its w component to zero, so the result also has w equal to zero.
+float4 normalize3(float4 a)
+{
+	a.w = 0;
+	return normalize(a);
+}
+// One work-item per node: clears the normal and the area of the node to zero.
+__kernel void 
+ResetNormalsAndAreasKernel(
+	const unsigned int numNodes,
+	__global float4 * g_vertexNormals,
+	__global float * g_vertexArea GUID_ARG)
+{
+	if( get_global_id(0) < numNodes )
+	{
+		g_vertexNormals[get_global_id(0)] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
+		g_vertexArea[get_global_id(0)]    = 0.0f;
+	}
+}
+// Work-item i handles face startFace+i for i < numFaces: computes the face normal and its length, stores them
+// per triangle (normal normalized), and adds the unnormalized normal and that length to each of the three vertices.
+__kernel void 
+UpdateSoftBodiesKernel(
+	const unsigned int startFace,
+	const unsigned int numFaces,
+	__global int4 * g_triangleVertexIndexSet,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_vertexNormals,
+	__global float * g_vertexArea,
+	__global float4 * g_triangleNormals,
+	__global float * g_triangleArea GUID_ARG)
+{
+	int faceID = get_global_id(0) + startFace;
+	if( get_global_id(0) < numFaces )
+	{		
+		int4 triangleIndexSet = g_triangleVertexIndexSet[ faceID ];
+		int nodeIndex0 = triangleIndexSet.x;
+		int nodeIndex1 = triangleIndexSet.y;
+		int nodeIndex2 = triangleIndexSet.z;
+		// NOTE(review): unsynchronized read-modify-write of per-vertex data; presumably faces in one dispatch share no vertex - confirm on the host side.
+		float4 node0 = g_vertexPositions[nodeIndex0];
+		float4 node1 = g_vertexPositions[nodeIndex1];
+		float4 node2 = g_vertexPositions[nodeIndex2];
+		float4 nodeNormal0 = g_vertexNormals[nodeIndex0];
+		float4 nodeNormal1 = g_vertexNormals[nodeIndex1];
+		float4 nodeNormal2 = g_vertexNormals[nodeIndex2];
+		float vertexArea0 = g_vertexArea[nodeIndex0];
+		float vertexArea1 = g_vertexArea[nodeIndex1];
+		float vertexArea2 = g_vertexArea[nodeIndex2];
+		
+		float4 vector0 = node1 - node0;
+		float4 vector1 = node2 - node0;
+		// triangleArea is the length of the edge cross product, which is twice the geometric area of the triangle.
+		float4 faceNormal = cross(vector0, vector1);
+		float triangleArea = length(faceNormal);
+
+		nodeNormal0 = nodeNormal0 + faceNormal;
+		nodeNormal1 = nodeNormal1 + faceNormal;
+		nodeNormal2 = nodeNormal2 + faceNormal;
+		vertexArea0 = vertexArea0 + triangleArea;
+		vertexArea1 = vertexArea1 + triangleArea;
+		vertexArea2 = vertexArea2 + triangleArea;
+		
+		g_triangleNormals[faceID] = normalize3(faceNormal);
+		g_vertexNormals[nodeIndex0] = nodeNormal0;
+		g_vertexNormals[nodeIndex1] = nodeNormal1;
+		g_vertexNormals[nodeIndex2] = nodeNormal2;
+		g_triangleArea[faceID] = triangleArea;
+		g_vertexArea[nodeIndex0] = vertexArea0;
+		g_vertexArea[nodeIndex1] = vertexArea1;
+		g_vertexArea[nodeIndex2] = vertexArea2;
+	}
+}
+// One work-item per node: normalizes the accumulated normal and divides the accumulated area by the triangle count of the node.
+__kernel void 
+NormalizeNormalsAndAreasKernel( 
+	const unsigned int numNodes,
+	__global int * g_vertexTriangleCount,
+	__global float4 * g_vertexNormals,
+	__global float * g_vertexArea GUID_ARG)
+{
+	if( get_global_id(0) < numNodes )
+	{
+		float4 normal = g_vertexNormals[get_global_id(0)];
+		float area = g_vertexArea[get_global_id(0)];
+		int numTriangles = g_vertexTriangleCount[get_global_id(0)];
+		
+		// A node referenced by no triangle gets area 0 instead of the non-finite result of dividing by zero.
+		
+		g_vertexNormals[get_global_id(0)] = normalize3(normal);
+		g_vertexArea[get_global_id(0)] = (numTriangles > 0) ? area/(float)(numTriangles) : 0.0f;
+	}
+}
+
+);
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositions.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositions.cl
new file mode 100644
index 000000000..3155a04e4
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositions.cl
@@ -0,0 +1,34 @@
+MSTRINGIFY(
+
+// Recomputes each vertex velocity from the distance moved since the previous
+// position and resets the vertex force to zero.
+__kernel void 
+updateVelocitiesFromPositionsWithoutVelocitiesKernel( 
+	const int numNodes,
+	const float isolverdt,
+	__global float4 * g_vertexPositions,
+	__global float4 * g_vertexPreviousPositions,
+	__global int * g_vertexClothIndices,
+	__global float * g_clothDampingFactor,
+	__global float4 * g_vertexVelocities,
+	__global float4 * g_vertexForces GUID_ARG)
+{
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{	
+		// The damping factor is looked up through the cloth index of the vertex.
+		int clothIndex = g_vertexClothIndices[nodeID];
+		float velocityCoefficient = (1.f - g_clothDampingFactor[clothIndex]);
+
+		float4 difference = g_vertexPositions[nodeID] - g_vertexPreviousPositions[nodeID];
+
+		// The stored velocity is overwritten outright and so is never loaded first.
+		// isolverdt is presumably the inverse of the solver time step - TODO confirm.
+		g_vertexVelocities[nodeID] = difference*velocityCoefficient*isolverdt;
+
+		// Clear the force entry of this vertex.
+		g_vertexForces[nodeID] = (float4)(0.f, 0.f, 0.f, 0.f);
+	}
+}
+
+);
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositionsFromVelocities.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositionsFromVelocities.cl
new file mode 100644
index 000000000..97e708bc3
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositionsFromVelocities.cl
@@ -0,0 +1,28 @@
+
+MSTRINGIFY(
+
+
+
+
+// Advances every vertex by velocity*solverSDT from its previous position and
+// stores the result as both the current and the previous position.
+__kernel void 
+UpdatePositionsFromVelocitiesKernel( 
+	const int numNodes,
+	const float solverSDT,
+	__global float4 * g_vertexVelocities,
+	__global float4 * g_vertexPreviousPositions,
+	__global float4 * g_vertexCurrentPosition GUID_ARG)
+{
+	const int idx = get_global_id(0);
+	// Surplus work-items beyond numNodes have nothing to update.
+	if( idx >= numNodes )
+		return;
+
+	const float4 displacement = g_vertexVelocities[idx]*solverSDT;
+	const float4 moved = g_vertexPreviousPositions[idx] + displacement;
+	g_vertexCurrentPosition[idx]   = moved;
+	g_vertexPreviousPositions[idx] = moved;
+}
+
+);
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/VSolveLinks.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/VSolveLinks.cl
new file mode 100644
index 000000000..a618d69cc
--- /dev/null
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/VSolveLinks.cl
@@ -0,0 +1,45 @@
+MSTRINGIFY(
+
+// Velocity pass over one batch of links: removes from both end nodes the
+// relative velocity component along the stored link vector and scales it by
+// the link length ratio and kst and by the inverse mass of each node.
+// Uses float4 only because 3-component vector types need OpenCL 1.1.
+__kernel void 
+VSolveLinksKernel( 
+	int startLink,
+	int numLinks,
+	float kst,
+	__global int2 * g_linksVertexIndices,
+	__global float * g_linksLengthRatio,
+	__global float4 * g_linksCurrentLength,
+	__global float * g_vertexInverseMass,
+	__global float4 * g_vertexVelocity GUID_ARG)
+{
+	// Work-items count from 0 inside the batch; link data is offset by startLink.
+	int linkID = get_global_id(0) + startLink;
+	if( get_global_id(0) < numLinks )
+	{		
+		int2 nodeIndices = g_linksVertexIndices[linkID];
+		int node0 = nodeIndices.x;
+		int node1 = nodeIndices.y;
+		float linkLengthRatio = g_linksLengthRatio[linkID];
+		float4 linkCurrentLength = g_linksCurrentLength[linkID];
+
+		float4 vertexVelocity0 = g_vertexVelocity[node0];
+		float4 vertexVelocity1 = g_vertexVelocity[node1];
+		float4 nodeDifference = vertexVelocity0 - vertexVelocity1;
+		// Three-component dot product written out by hand so that w never contributes.
+		float dotResult = linkCurrentLength.x*nodeDifference.x + linkCurrentLength.y*nodeDifference.y + linkCurrentLength.z*nodeDifference.z;
+		float j = -dotResult*linkLengthRatio*kst;
+
+		vertexVelocity0 += linkCurrentLength*(j*g_vertexInverseMass[node0]);
+		vertexVelocity1 -= linkCurrentLength*(j*g_vertexInverseMass[node1]);
+		// The stored velocities always carry w = 0 whatever the inputs held in w.
+		vertexVelocity0.w = 0.f;
+		vertexVelocity1.w = 0.f;
+		g_vertexVelocity[node0] = vertexVelocity0;
+		g_vertexVelocity[node1] = vertexVelocity1;
+	}
+}
+
+);
\ No newline at end of file
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h
index e71ae8778..8fa58cd16 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h
@@ -17,7 +17,16 @@ subject to the following restrictions:
 #define BT_SOFT_BODY_SOLVER_BUFFER_OPENCL_H
 
 // OpenCL support
-#include <CL/cl.hpp>
+
+#ifdef USE_MINICL
+	#include "MiniCL/cl.h"
+#else //USE_MINICL
+	#ifdef __APPLE__
+		#include <OpenCL/OpenCL.h>
+	#else
+		#include <CL/cl.h>
+	#endif //__APPLE__
+#endif//USE_MINICL
 
 #ifndef SAFE_RELEASE
 #define SAFE_RELEASE(p)      { if(p) { (p)->Release(); (p)=NULL; } }
@@ -25,22 +34,25 @@ subject to the following restrictions:
 
 template <typename ElementType> class btOpenCLBuffer
 {
-protected:
-	cl::CommandQueue m_queue;
-	btAlignedObjectArray< ElementType > * m_CPUBuffer;
-	cl::Buffer m_buffer;
+public:
 
+	cl_command_queue	m_cqCommandQue;
+	cl_context			m_clContext;
+	cl_mem				m_buffer;
+
+
+
+	btAlignedObjectArray< ElementType > * m_CPUBuffer;
+	
 	int  m_gpuSize;
 	bool m_onGPU;
-
 	bool m_readOnlyOnGPU;
-
 	bool m_allocated;
-	// TODO: Remove this once C++ bindings are fixed
-	cl::Context context;
 
-	bool createBuffer( cl::Buffer *preexistingBuffer = 0)
+
+	bool createBuffer( cl_mem* preexistingBuffer = 0)
 	{
+
 		cl_int err;
 		 
 
@@ -49,12 +61,11 @@ protected:
 			m_buffer = *preexistingBuffer;
 		} 
 		else {
-			m_buffer = cl::Buffer(
-					context, 
-					m_readOnlyOnGPU ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE, 
-					m_CPUBuffer->size() * sizeof(ElementType), 
-					0, 
-					&err);
+
+			cl_mem_flags flags= m_readOnlyOnGPU ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE;
+
+			size_t size = m_CPUBuffer->size() * sizeof(ElementType);
+			m_buffer = clCreateBuffer(m_clContext, flags, size, 0, &err);
 			if( err != CL_SUCCESS )
 			{
 				btAssert( "Buffer::Buffer(m_buffer)");
@@ -62,35 +73,31 @@ protected:
 		}
 
 		m_gpuSize = m_CPUBuffer->size();
+
 		return true;
 	}
 
 public:
-	btOpenCLBuffer( 
-		cl::CommandQueue queue,
-		btAlignedObjectArray< ElementType > *CPUBuffer, 
-		bool readOnly) :
-		m_queue(queue),
+	btOpenCLBuffer( cl_command_queue	commandQue,cl_context ctx, btAlignedObjectArray< ElementType >* CPUBuffer, bool readOnly)
+		:m_cqCommandQue(commandQue),
+		m_clContext(ctx),
 		m_CPUBuffer(CPUBuffer),
 		m_gpuSize(0),
 		m_onGPU(false),
 		m_readOnlyOnGPU(readOnly),
 		m_allocated(false)
 	{
-		context = m_queue.getInfo<CL_QUEUE_CONTEXT>();
 	}
 
 	~btOpenCLBuffer()
 	{
 	}
 
-	cl::Buffer getBuffer()
-	{
-		return m_buffer;
-	}
 
 	bool moveToGPU()
 	{
+
+
 		cl_int err;
 
 		if( (m_CPUBuffer->size() != m_gpuSize) )
@@ -107,12 +114,12 @@ public:
 				m_allocated = true;
 			}
 			
-			err = m_queue.enqueueWriteBuffer(
-				m_buffer,
+			size_t size = m_CPUBuffer->size() * sizeof(ElementType);
+			err = clEnqueueWriteBuffer(m_cqCommandQue,m_buffer,
 				CL_FALSE,
 				0,
-				m_CPUBuffer->size() * sizeof(ElementType), 
-				&((*m_CPUBuffer)[0]));
+				size, 
+				&((*m_CPUBuffer)[0]),0,0,0);
 			if( err != CL_SUCCESS )
 			{
 				btAssert( "CommandQueue::enqueueWriteBuffer(m_buffer)" );
@@ -122,20 +129,23 @@ public:
 		}
 
 		return true;
+
 	}
 
 	bool moveFromGPU()
 	{
+
 		cl_int err;
 
 		if (m_CPUBuffer->size() > 0) {
 			if (m_onGPU && !m_readOnlyOnGPU) {
-				err = m_queue.enqueueReadBuffer(
+				size_t size = m_CPUBuffer->size() * sizeof(ElementType);
+				err = clEnqueueReadBuffer(m_cqCommandQue,
 					m_buffer,
 					CL_TRUE,
 					0,
-					m_CPUBuffer->size() * sizeof(ElementType), 
-					&((*m_CPUBuffer)[0]));
+					size,
+					&((*m_CPUBuffer)[0]),0,0,0);
 
 				if( err != CL_SUCCESS )
 				{
@@ -151,16 +161,17 @@ public:
 
 	bool copyFromGPU()
 	{
+
 		cl_int err;
+		size_t size = m_CPUBuffer->size() * sizeof(ElementType);
 
 		if (m_CPUBuffer->size() > 0) {
 			if (m_onGPU && !m_readOnlyOnGPU) {
-				err = m_queue.enqueueReadBuffer(
+				err = clEnqueueReadBuffer(m_cqCommandQue,
 					m_buffer,
 					CL_TRUE,
-					0,
-					m_CPUBuffer->size() * sizeof(ElementType), 
-					&((*m_CPUBuffer)[0]));
+					0,size, 
+					&((*m_CPUBuffer)[0]),0,0,0);
 
 				if( err != CL_SUCCESS )
 				{
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h
index 6c270c5b5..cef924f6f 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h
@@ -13,8 +13,8 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-#include "BulletSoftBody/Solvers/CPU/btSoftBodySolverData.h"
-#include "BulletSoftBody/Solvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h"
+#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
 
 
 #ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_H
@@ -25,7 +25,9 @@ class btSoftBodyLinkDataOpenCL : public btSoftBodyLinkData
 {
 public:
 	bool				m_onGPU;
-	cl::CommandQueue	m_queue;
+
+	cl_command_queue	m_cqCommandQue;
+
 
 	btOpenCLBuffer<LinkNodePair> m_clLinks;
 	btOpenCLBuffer<float>							      m_clLinkStrength;
@@ -36,6 +38,24 @@ public:
 	btOpenCLBuffer<float>								  m_clLinksRestLength;
 	btOpenCLBuffer<float>								  m_clLinksMaterialLinearStiffnessCoefficient;
 
+	// One computation batch over the link arrays:
+	// the first index it covers plus the number of entries.
+	struct BatchPair
+	{
+		int start;
+		int length;
+
+		// Default: an empty batch at index 0.
+		BatchPair() : start(0), length(0)
+		{
+		}
+
+		// Batch covering l entries beginning at index s.
+		BatchPair( int s, int l ) : start(s), length(l)
+		{
+		}
+	};
+
 	/**
 	 * Link addressing information for each cloth.
 	 * Allows link locations to be computed independently of data batching.
@@ -45,9 +65,9 @@ public:
 	/**
 	 * Start and length values for computation batches over link data.
 	 */
-	btAlignedObjectArray< std::pair< int, int > >		m_batchStartLengths;
+	btAlignedObjectArray< BatchPair >		m_batchStartLengths;
 
-	btSoftBodyLinkDataOpenCL(cl::CommandQueue queue);
+	btSoftBodyLinkDataOpenCL(cl_command_queue queue, cl_context ctx);
 
 	virtual ~btSoftBodyLinkDataOpenCL();
 
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h
index 4bc9215ea..e1094e38a 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h
@@ -14,8 +14,8 @@ subject to the following restrictions:
 */
 
 
-#include "BulletSoftBody/Solvers/CPU/btSoftBodySolverData.h"
-#include "BulletSoftBody/Solvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h"
+#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
 
 
 #ifndef BT_SOFT_BODY_SOLVER_TRIANGLE_DATA_OPENCL_H
@@ -26,7 +26,7 @@ class btSoftBodyTriangleDataOpenCL : public btSoftBodyTriangleData
 {
 public:
 	bool				m_onGPU;
-	cl::CommandQueue    m_queue;
+	cl_command_queue    m_queue;
 
 	btOpenCLBuffer<btSoftBodyTriangleData::TriangleNodeSet>					m_clVertexIndices;
 	btOpenCLBuffer<float>								m_clArea;
@@ -41,10 +41,20 @@ public:
 	/**
 	 * Start and length values for computation batches over link data.
 	 */
-	btAlignedObjectArray< std::pair< int, int > >		m_batchStartLengths;
+	struct btSomePair
+	{
+		// Default entries are zeroed (as BatchPair does for links) so that a
+		// resized batch array never exposes indeterminate ints.
+		btSomePair() : first(0), second(0) {}
+		btSomePair(int f,int s) : first(f), second(s) {}
+		// Presumably (batch start, batch length) - confirm against generateBatches.
+		int first;
+		int second;
+	};
+	btAlignedObjectArray< btSomePair >		m_batchStartLengths;
 
 public:
-	btSoftBodyTriangleDataOpenCL( cl::CommandQueue queue );
+	btSoftBodyTriangleDataOpenCL( cl_command_queue queue, cl_context ctx );
 
 	virtual ~btSoftBodyTriangleDataOpenCL();
 
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
index 8f65c9de4..24997e726 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
@@ -13,8 +13,8 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-#include "BulletSoftBody/Solvers/CPU/btSoftBodySolverData.h"
-#include "BulletSoftBody/Solvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h"
+#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
 
 #ifndef BT_SOFT_BODY_SOLVER_VERTEX_DATA_OPENCL_H
 #define BT_SOFT_BODY_SOLVER_VERTEX_DATA_OPENCL_H
@@ -24,7 +24,7 @@ class btSoftBodyVertexDataOpenCL : public btSoftBodyVertexData
 {
 protected:
 	bool		m_onGPU;
-	cl::CommandQueue m_queue;
+	cl_command_queue	m_queue;
 
 public:
 	btOpenCLBuffer<int>									m_clClothIdentifier;
@@ -37,7 +37,7 @@ public:
 	btOpenCLBuffer<float>									m_clVertexArea;
 	btOpenCLBuffer<int>									m_clVertexTriangleCount;
 public:
-	btSoftBodyVertexDataOpenCL( cl::CommandQueue queue);
+	btSoftBodyVertexDataOpenCL( cl_command_queue queue,  cl_context ctx);
 
 	virtual ~btSoftBodyVertexDataOpenCL();
 
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
index 31b52f679..8198a12d3 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
@@ -16,10 +16,18 @@ subject to the following restrictions:
 
 #include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
 #include "vectormath/vmInclude.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolver_OpenCL.h"
-#include "BulletSoftBody/VertexBuffers/btSoftBodySolverVertexBuffer.h"
+#include <stdio.h> //@todo: remove the debugging printf at some stage
+#include "btSoftBodySolver_OpenCL.h"
+#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
 #include "BulletSoftBody/btSoftBody.h"
 
+     static const size_t workGroupSize = 128;
+
+
+//CL_VERSION_1_1 seems broken on NVidia SDK so just disable it
+
+#if (0)//CL_VERSION_1_1 == 1)
+ //OpenCL 1.1 kernels use float3
 #define MSTRINGIFY(A) #A
 static char* PrepareLinksCLString = 
 #include "OpenCLC/PrepareLinks.cl"
@@ -41,19 +49,43 @@ static char* UpdateNormalsCLString =
 #include "OpenCLC/UpdateNormals.cl"
 static char* VSolveLinksCLString = 
 #include "OpenCLC/VSolveLinks.cl"
+#else
+////OpenCL 1.0 kernels don't use float3
+#define MSTRINGIFY(A) #A
+static char* PrepareLinksCLString = 
+#include "OpenCLC10/PrepareLinks.cl"
+static char* UpdatePositionsFromVelocitiesCLString = 
+#include "OpenCLC10/UpdatePositionsFromVelocities.cl"
+static char* SolvePositionsCLString = 
+#include "OpenCLC10/SolvePositions.cl"
+static char* UpdateNodesCLString = 
+#include "OpenCLC10/UpdateNodes.cl"
+static char* UpdatePositionsCLString = 
+#include "OpenCLC10/UpdatePositions.cl"
+static char* UpdateConstantsCLString = 
+#include "OpenCLC10/UpdateConstants.cl"
+static char* IntegrateCLString = 
+#include "OpenCLC10/Integrate.cl"
+static char* ApplyForcesCLString = 
+#include "OpenCLC10/ApplyForces.cl"
+static char* UpdateNormalsCLString = 
+#include "OpenCLC10/UpdateNormals.cl"
+static char* VSolveLinksCLString = 
+#include "OpenCLC10/VSolveLinks.cl"
+#endif //CL_VERSION_1_1
 
 
-btSoftBodyVertexDataOpenCL::btSoftBodyVertexDataOpenCL( cl::CommandQueue queue) :
+btSoftBodyVertexDataOpenCL::btSoftBodyVertexDataOpenCL( cl_command_queue queue, cl_context ctx) : // hands queue, ctx and the matching host array to every device buffer wrapper
     m_queue(queue),
-	m_clClothIdentifier( queue, &m_clothIdentifier, false ),
-	m_clVertexPosition( queue, &m_vertexPosition, false ),
-	m_clVertexPreviousPosition( queue, &m_vertexPreviousPosition, false ),
-	m_clVertexVelocity( queue, &m_vertexVelocity, false ),
-	m_clVertexForceAccumulator( queue, &m_vertexForceAccumulator, false ),
-	m_clVertexNormal( queue, &m_vertexNormal, false ),
-	m_clVertexInverseMass( queue, &m_vertexInverseMass, false ),
-	m_clVertexArea( queue, &m_vertexArea, false ),
-	m_clVertexTriangleCount( queue, &m_vertexTriangleCount, false )
+	m_clClothIdentifier( queue, ctx, &m_clothIdentifier, false ), // trailing false is the btOpenCLBuffer readOnly flag
+	m_clVertexPosition( queue, ctx, &m_vertexPosition, false ),
+	m_clVertexPreviousPosition( queue, ctx, &m_vertexPreviousPosition, false ),
+	m_clVertexVelocity( queue, ctx, &m_vertexVelocity, false ),
+	m_clVertexForceAccumulator( queue, ctx, &m_vertexForceAccumulator, false ),
+	m_clVertexNormal( queue, ctx, &m_vertexNormal, false ),
+	m_clVertexInverseMass( queue, ctx, &m_vertexInverseMass, false ),
+	m_clVertexArea( queue, ctx, &m_vertexArea, false ),
+	m_clVertexTriangleCount( queue, ctx, &m_vertexTriangleCount, false )
 {
 }
 
@@ -108,16 +140,16 @@ bool btSoftBodyVertexDataOpenCL::moveFromAccelerator()
 
 
 
-btSoftBodyLinkDataOpenCL::btSoftBodyLinkDataOpenCL(cl::CommandQueue queue) :
-    m_queue(queue),
-	m_clLinks( queue, &m_links, false ),
-	m_clLinkStrength( queue, &m_linkStrength, false ),
-	m_clLinksMassLSC( queue, &m_linksMassLSC, false ),
-	m_clLinksRestLengthSquared( queue, &m_linksRestLengthSquared, false ),
-	m_clLinksCLength( queue, &m_linksCLength, false ),
-	m_clLinksLengthRatio( queue, &m_linksLengthRatio, false ),
-	m_clLinksRestLength( queue, &m_linksRestLength, false ),
-	m_clLinksMaterialLinearStiffnessCoefficient( queue, &m_linksMaterialLinearStiffnessCoefficient, false )
+btSoftBodyLinkDataOpenCL::btSoftBodyLinkDataOpenCL(cl_command_queue queue,  cl_context ctx) // hands queue, ctx and the matching host array to every link buffer wrapper
+:m_cqCommandQue(queue),
+	m_clLinks( queue, ctx, &m_links, false ), // trailing false is the btOpenCLBuffer readOnly flag
+	m_clLinkStrength( queue, ctx, &m_linkStrength, false ),
+	m_clLinksMassLSC( queue, ctx, &m_linksMassLSC, false ),
+	m_clLinksRestLengthSquared( queue, ctx, &m_linksRestLengthSquared, false ),
+	m_clLinksCLength( queue, ctx, &m_linksCLength, false ),
+	m_clLinksLengthRatio( queue, ctx, &m_linksLengthRatio, false ),
+	m_clLinksRestLength( queue, ctx, &m_linksRestLength, false ),
+	m_clLinksMaterialLinearStiffnessCoefficient( queue, ctx, &m_linksMaterialLinearStiffnessCoefficient, false )
 {
 }
 
@@ -272,13 +304,13 @@ void btSoftBodyLinkDataOpenCL::generateBatches()
 	if( m_batchStartLengths.size() > 0 )
 	{
 		m_batchStartLengths.resize(batchCounts.size());
-		m_batchStartLengths[0] = std::pair< int, int >( 0, 0 );
+		m_batchStartLengths[0] = BatchPair(0, 0);
 
 		int sum = 0;
 		for( int batchIndex = 0; batchIndex < batchCounts.size(); ++batchIndex )
 		{
-			m_batchStartLengths[batchIndex].first = sum;
-			m_batchStartLengths[batchIndex].second = batchCounts[batchIndex];
+			m_batchStartLengths[batchIndex].start = sum;
+			m_batchStartLengths[batchIndex].length = batchCounts[batchIndex];
 			sum += batchCounts[batchIndex];
 		}
 	}
@@ -313,7 +345,7 @@ void btSoftBodyLinkDataOpenCL::generateBatches()
 		// next element in that batch, incrementing the batch counter
 		// afterwards
 		int batch = batchValues[linkIndex];
-		int newLocation = m_batchStartLengths[batch].first + batchCounts[batch];
+		int newLocation = m_batchStartLengths[batch].start + batchCounts[batch];
 
 		batchCounts[batch] = batchCounts[batch] + 1;
 		m_links[newLocation] = m_links_Backup[linkLocation];
@@ -336,11 +368,11 @@ void btSoftBodyLinkDataOpenCL::generateBatches()
 
 
 
-btSoftBodyTriangleDataOpenCL::btSoftBodyTriangleDataOpenCL( cl::CommandQueue queue ) : 
+btSoftBodyTriangleDataOpenCL::btSoftBodyTriangleDataOpenCL( cl_command_queue queue , cl_context ctx) : // hands queue, ctx and the matching host array to every triangle buffer wrapper
     m_queue( queue ),
-	m_clVertexIndices( queue, &m_vertexIndices, false ),
-	m_clArea( queue, &m_area, false ),
-	m_clNormal( queue, &m_normal, false )
+	m_clVertexIndices( queue, ctx, &m_vertexIndices, false ), // trailing false is the btOpenCLBuffer readOnly flag
+	m_clArea( queue, ctx, &m_area, false ),
+	m_clNormal( queue, ctx, &m_normal, false )
 {
 }
 
@@ -493,7 +525,7 @@ void btSoftBodyTriangleDataOpenCL::generateBatches()
 
 
 	m_batchStartLengths.resize(batchCounts.size());
-	m_batchStartLengths[0] = std::pair< int, int >( 0, 0 );
+	m_batchStartLengths[0] = btSomePair(0,0);
 
 
 	int sum = 0;
@@ -547,18 +579,19 @@ void btSoftBodyTriangleDataOpenCL::generateBatches()
 
 
 
-btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(const cl::CommandQueue &queue) :
-	m_linkData(queue),
-	m_vertexData(queue),
-	m_triangleData(queue),
-	m_clPerClothAcceleration(queue, &m_perClothAcceleration, true ),
-	m_clPerClothWindVelocity(queue, &m_perClothWindVelocity, true ),
-	m_clPerClothDampingFactor(queue, &m_perClothDampingFactor, true ),
-	m_clPerClothVelocityCorrectionCoefficient(queue, &m_perClothVelocityCorrectionCoefficient, true ),
-	m_clPerClothLiftFactor(queue, &m_perClothLiftFactor, true ),
-	m_clPerClothDragFactor(queue, &m_perClothDragFactor, true ),
-	m_clPerClothMediumDensity(queue, &m_perClothMediumDensity, true ),
-	m_queue( queue )
+btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_context ctx) :
+	m_linkData(queue, ctx),
+	m_vertexData(queue, ctx),
+	m_triangleData(queue, ctx),
+	m_clPerClothAcceleration(queue, ctx, &m_perClothAcceleration, true ),
+	m_clPerClothWindVelocity(queue, ctx, &m_perClothWindVelocity, true ),
+	m_clPerClothDampingFactor(queue,ctx, &m_perClothDampingFactor, true ),
+	m_clPerClothVelocityCorrectionCoefficient(queue, ctx,&m_perClothVelocityCorrectionCoefficient, true ),
+	m_clPerClothLiftFactor(queue, ctx,&m_perClothLiftFactor, true ),
+	m_clPerClothDragFactor(queue, ctx,&m_perClothDragFactor, true ),
+	m_clPerClothMediumDensity(queue, ctx,&m_perClothMediumDensity, true ),
+	m_cqCommandQue( queue ),
+	m_cxMainContext(ctx)
 {
 	// Initial we will clearly need to update solver constants
 	// For now this is global for the cloths linked with this solver - we should probably make this body specific 
@@ -590,7 +623,7 @@ void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &sof
 			using Vectormath::Aos::Point3;
 
 			// Create SoftBody that will store the information within the solver
-			btAcceleratedSoftBodyInterface *newSoftBody = new btAcceleratedSoftBodyInterface( softBody );
+			btOpenCLAcceleratedSoftBodyInterface *newSoftBody = new btOpenCLAcceleratedSoftBodyInterface( softBody );
 			m_softBodySet.push_back( newSoftBody );
 
 			m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );
@@ -712,51 +745,58 @@ bool btOpenCLSoftBodySolver::checkInitialized()
 
 void btOpenCLSoftBodySolver::resetNormalsAndAreas( int numVertices )
 {
-	resetNormalsAndAreasKernel.kernel.setArg(0, numVertices);
-	resetNormalsAndAreasKernel.kernel.setArg(1, m_vertexData.m_clVertexNormal.getBuffer());
-	resetNormalsAndAreasKernel.kernel.setArg(2, m_vertexData.m_clVertexArea.getBuffer());
+	cl_int ciErrNum; // binds the arguments of resetNormalsAndAreasKernel and enqueues it; every status is OR-ed in (CL_SUCCESS is 0) so no failure is lost
+	ciErrNum = clSetKernelArg(resetNormalsAndAreasKernel, 0, sizeof(numVertices), (void*)&numVertices);
+	ciErrNum |= clSetKernelArg(resetNormalsAndAreasKernel, 1, sizeof(cl_mem), (void*)&m_vertexData.m_clVertexNormal.m_buffer);
+	ciErrNum |= clSetKernelArg(resetNormalsAndAreasKernel,  2, sizeof(cl_mem), (void*)&m_vertexData.m_clVertexArea.m_buffer);
+	size_t numWorkItems = workGroupSize*((numVertices + (workGroupSize-1)) / workGroupSize); // numVertices rounded up to whole work-groups
+	ciErrNum |= clEnqueueNDRangeKernel(m_cqCommandQue, resetNormalsAndAreasKernel, 1, NULL, &numWorkItems, &workGroupSize, 0,0,0 );
 
-	int	numWorkItems = workGroupSize*((numVertices + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(resetNormalsAndAreasKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS )
+	if( ciErrNum != CL_SUCCESS )
 	{
-		btAssert( "enqueueNDRangeKernel(resetNormalsAndAreasKernel)" );
+		btAssert( 0 && "enqueueNDRangeKernel(resetNormalsAndAreasKernel)" );
 	}
+
 }
 
 void btOpenCLSoftBodySolver::normalizeNormalsAndAreas( int numVertices )
 {
-	normalizeNormalsAndAreasKernel.kernel.setArg(0, numVertices);
-	normalizeNormalsAndAreasKernel.kernel.setArg(1, m_vertexData.m_clVertexTriangleCount.getBuffer());
-	normalizeNormalsAndAreasKernel.kernel.setArg(2, m_vertexData.m_clVertexNormal.getBuffer());
-	normalizeNormalsAndAreasKernel.kernel.setArg(3, m_vertexData.m_clVertexArea.getBuffer());
 
-	int	numWorkItems = workGroupSize*((numVertices + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(normalizeNormalsAndAreasKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum;
+	// Bind the arguments of normalizeNormalsAndAreasKernel and enqueue it; statuses are OR-ed together (CL_SUCCESS is 0) so no failure is lost.
+	ciErrNum = clSetKernelArg(normalizeNormalsAndAreasKernel, 0, sizeof(int),(void*) &numVertices);
+	ciErrNum |= clSetKernelArg(normalizeNormalsAndAreasKernel, 1, sizeof(cl_mem), &m_vertexData.m_clVertexTriangleCount.m_buffer);
+	ciErrNum |= clSetKernelArg(normalizeNormalsAndAreasKernel, 2, sizeof(cl_mem), &m_vertexData.m_clVertexNormal.m_buffer);
+	ciErrNum |= clSetKernelArg(normalizeNormalsAndAreasKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexArea.m_buffer);
+	size_t	numWorkItems = workGroupSize*((numVertices + (workGroupSize-1)) / workGroupSize); // numVertices rounded up to whole work-groups
+	ciErrNum |= clEnqueueNDRangeKernel(m_cqCommandQue, normalizeNormalsAndAreasKernel, 1, NULL, &numWorkItems, &workGroupSize, 0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert( "enqueueNDRangeKernel(normalizeNormalsAndAreasKernel)");
+		btAssert( 0 && "enqueueNDRangeKernel(normalizeNormalsAndAreasKernel)");
 	}
+
 }
 
 void btOpenCLSoftBodySolver::executeUpdateSoftBodies( int firstTriangle, int numTriangles )
 {
-	updateSoftBodiesKernel.kernel.setArg(0, firstTriangle);
-	updateSoftBodiesKernel.kernel.setArg(1, numTriangles);
-	updateSoftBodiesKernel.kernel.setArg(2, m_triangleData.m_clVertexIndices.getBuffer());
-	updateSoftBodiesKernel.kernel.setArg(3, m_vertexData.m_clVertexPosition.getBuffer());
-	updateSoftBodiesKernel.kernel.setArg(4, m_vertexData.m_clVertexNormal.getBuffer());
-	updateSoftBodiesKernel.kernel.setArg(5, m_vertexData.m_clVertexArea.getBuffer());
-	updateSoftBodiesKernel.kernel.setArg(6, m_triangleData.m_clNormal.getBuffer());
-	updateSoftBodiesKernel.kernel.setArg(7, m_triangleData.m_clArea.getBuffer());
 
+	cl_int ciErrNum; // binds the arguments of updateSoftBodiesKernel and enqueues it; every status is OR-ed in (CL_SUCCESS is 0) so no failure is lost
+	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 0, sizeof(int), (void*) &firstTriangle);
+	ciErrNum |= clSetKernelArg(updateSoftBodiesKernel, 1, sizeof(int), &numTriangles);
+	ciErrNum |= clSetKernelArg(updateSoftBodiesKernel, 2, sizeof(cl_mem), &m_triangleData.m_clVertexIndices.m_buffer);
+	ciErrNum |= clSetKernelArg(updateSoftBodiesKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
+	ciErrNum |= clSetKernelArg(updateSoftBodiesKernel, 4, sizeof(cl_mem), &m_vertexData.m_clVertexNormal.m_buffer);
+	ciErrNum |= clSetKernelArg(updateSoftBodiesKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexArea.m_buffer);
+	ciErrNum |= clSetKernelArg(updateSoftBodiesKernel, 6, sizeof(cl_mem), &m_triangleData.m_clNormal.m_buffer);
+	ciErrNum |= clSetKernelArg(updateSoftBodiesKernel, 7, sizeof(cl_mem), &m_triangleData.m_clArea.m_buffer);
 
-	int	numWorkItems = workGroupSize*((numTriangles + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(updateSoftBodiesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	size_t numWorkItems = workGroupSize*((numTriangles + (workGroupSize-1)) / workGroupSize); // numTriangles rounded up to whole work-groups
+	ciErrNum |= clEnqueueNDRangeKernel(m_cqCommandQue, updateSoftBodiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(normalizeNormalsAndAreasKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(updateSoftBodiesKernel)");
 	}
+
 }
 
 void btOpenCLSoftBodySolver::updateSoftBodies()
@@ -807,6 +847,7 @@ void btOpenCLSoftBodySolver::ApplyClampedForce( float solverdt, const Vectormath
 
 void btOpenCLSoftBodySolver::applyForces( float solverdt )
 {	
+
 	// Ensure data is on accelerator
 	m_vertexData.moveToAccelerator();
 	m_clPerClothAcceleration.moveToGPU();
@@ -815,85 +856,30 @@ void btOpenCLSoftBodySolver::applyForces( float solverdt )
 	m_clPerClothMediumDensity.moveToGPU();
 	m_clPerClothWindVelocity.moveToGPU();			
 
-	cl_int err;
-	err = applyForcesKernel.kernel.setArg(0, m_vertexData.getNumVertices());
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum ;
+	int numVerts = m_vertexData.getNumVertices();
+	ciErrNum = clSetKernelArg(applyForcesKernel, 0, sizeof(int), &numVerts);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 1, sizeof(float), &solverdt);
+	float fl = FLT_EPSILON;
+	ciErrNum = clSetKernelArg(applyForcesKernel, 2, sizeof(float), &fl);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 3, sizeof(cl_mem), &m_vertexData.m_clClothIdentifier.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 4, sizeof(cl_mem), &m_vertexData.m_clVertexNormal.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexArea.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 6, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 7, sizeof(cl_mem), &m_clPerClothLiftFactor.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 8 ,sizeof(cl_mem), &m_clPerClothDragFactor.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel, 9, sizeof(cl_mem), &m_clPerClothWindVelocity.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel,10, sizeof(cl_mem), &m_clPerClothAcceleration.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel,11, sizeof(cl_mem), &m_clPerClothMediumDensity.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel,12, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
+	ciErrNum = clSetKernelArg(applyForcesKernel,13, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
+	size_t numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,applyForcesKernel, 1, NULL, &numWorkItems, &workGroupSize, 0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(1, solverdt);
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(2, FLT_EPSILON);
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(3, m_vertexData.m_clClothIdentifier.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(4, m_vertexData.m_clVertexNormal.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(5, m_vertexData.m_clVertexArea.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(6, m_vertexData.m_clVertexInverseMass.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(7, m_clPerClothLiftFactor.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(8, m_clPerClothDragFactor.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(9, m_clPerClothWindVelocity.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(10, m_clPerClothAcceleration.getBuffer());
-	if( err != CL_SUCCESS )
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(11, m_clPerClothMediumDensity.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(12, m_vertexData.m_clVertexForceAccumulator.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
-	err = applyForcesKernel.kernel.setArg(13, m_vertexData.m_clVertexVelocity.getBuffer());
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 
-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
-
-	err = m_queue.enqueueNDRangeKernel(applyForcesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
-	}
 }
 
 /**
@@ -901,22 +887,26 @@ void btOpenCLSoftBodySolver::applyForces( float solverdt )
  */
 void btOpenCLSoftBodySolver::integrate( float solverdt )
 {
+	
+
 	// Ensure data is on accelerator
 	m_vertexData.moveToAccelerator();
 
-	integrateKernel.kernel.setArg(0, m_vertexData.getNumVertices());
-	integrateKernel.kernel.setArg(1, solverdt);
-	integrateKernel.kernel.setArg(2, m_vertexData.m_clVertexInverseMass.getBuffer());
-	integrateKernel.kernel.setArg(3, m_vertexData.m_clVertexPosition.getBuffer());
-	integrateKernel.kernel.setArg(4, m_vertexData.m_clVertexVelocity.getBuffer());
-	integrateKernel.kernel.setArg(5, m_vertexData.m_clVertexPreviousPosition.getBuffer());
-	integrateKernel.kernel.setArg(6, m_vertexData.m_clVertexForceAccumulator.getBuffer());
+	cl_int ciErrNum;
+	int numVerts = m_vertexData.getNumVertices();
+	ciErrNum = clSetKernelArg(integrateKernel, 0, sizeof(int), &numVerts);
+	ciErrNum = clSetKernelArg(integrateKernel, 1, sizeof(float), &solverdt);
+	ciErrNum = clSetKernelArg(integrateKernel, 2, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
+	ciErrNum = clSetKernelArg(integrateKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
+	ciErrNum = clSetKernelArg(integrateKernel, 4, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
+	ciErrNum = clSetKernelArg(integrateKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
+	ciErrNum = clSetKernelArg(integrateKernel, 6, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
 
-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(integrateKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS )
+	size_t numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,integrateKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS )
 	{
-		btAssert(  "enqueueNDRangeKernel(integrateKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(integrateKernel)");
 	}
 
 }
@@ -935,6 +925,7 @@ float btOpenCLSoftBodySolver::computeTriangleArea(
 
 void btOpenCLSoftBodySolver::updateConstants( float timeStep )
 {			
+
 	using namespace Vectormath::Aos;
 
 	if( m_updateSolverConstants )
@@ -959,10 +950,12 @@ void btOpenCLSoftBodySolver::updateConstants( float timeStep )
 			m_linkData.getRestLengthSquared(linkIndex) = restLengthSquared;
 		}
 	}
+
 }
 
 void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 {
+
 	using Vectormath::Aos::Vector3;
 	using Vectormath::Aos::Point3;
 	using Vectormath::Aos::lengthSqr;
@@ -988,33 +981,34 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 
 
 
-	// Prepare anchors
-	/*for(i=0,ni=m_anchors.size();i<ni;++i)
+	for( int iteration = 0; iteration < m_numberOfVelocityIterations ; ++iteration )
 	{
-		Anchor&			a=m_anchors[i];
-		const btVector3	ra=a.m_body->getWorldTransform().getBasis()*a.m_local;
-		a.m_c0	=	ImpulseMatrix(	m_sst.sdt,
-			a.m_node->m_im,
-			a.m_body->getInvMass(),
-			a.m_body->getInvInertiaTensorWorld(),
-			ra);
-		a.m_c1	=	ra;
-		a.m_c2	=	m_sst.sdt*a.m_node->m_im;
-		a.m_body->activate();
-	}*/
+		for( int i = 0; i < m_linkData.m_batchStartLengths.size(); ++i )
+		{
+			int startLink = m_linkData.m_batchStartLengths[i].start;
+			int numLinks = m_linkData.m_batchStartLengths[i].length;
 
-	// Really want to combine these into a single loop, don't we? No update in the middle?
-
-	// TODO: Double check what kst is meant to mean - passed in as 1 in the bullet code
+			solveLinksForVelocity( startLink, numLinks, kst );
+		}
+	}
 
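+	// NOTE(review): this comment originally described computing new positions from velocity, and refreshing the
+	// previous position so that the position solve builds on the velocity solution rather than the original positions.
+	// The calls below instead update velocities from positions; confirm whether updatePositionsFromVelocities() was intended.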
+	if( m_numberOfVelocityIterations > 0 )
+	{
+		updateVelocitiesFromPositionsWithVelocities( 1.f/solverdt );
+	} else {
+		updateVelocitiesFromPositionsWithoutVelocities( 1.f/solverdt );
+	}
 
 	// Solve drift
 	for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
 	{
 		for( int i = 0; i < m_linkData.m_batchStartLengths.size(); ++i )
 		{
-			int startLink = m_linkData.m_batchStartLengths[i].first;
-			int numLinks = m_linkData.m_batchStartLengths[i].second;
+			int startLink = m_linkData.m_batchStartLengths[i].start;
+			int numLinks = m_linkData.m_batchStartLengths[i].length;
 
 			solveLinksForPosition( startLink, numLinks, kst, ti );
 		}
@@ -1023,6 +1017,7 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 
 
 	updateVelocitiesFromPositionsWithoutVelocities( 1.f/solverdt );
+
 }
 
 
@@ -1030,96 +1025,136 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 // Kernel dispatches
 void btOpenCLSoftBodySolver::prepareLinks()
 {
-	prepareLinksKernel.kernel.setArg(0, m_linkData.getNumLinks());
-	prepareLinksKernel.kernel.setArg(1, m_linkData.m_clLinks.getBuffer());
-	prepareLinksKernel.kernel.setArg(2, m_linkData.m_clLinksMassLSC.getBuffer());
-	prepareLinksKernel.kernel.setArg(3, m_vertexData.m_clVertexPreviousPosition.getBuffer());
-	prepareLinksKernel.kernel.setArg(4, m_linkData.m_clLinksLengthRatio.getBuffer());
-	prepareLinksKernel.kernel.setArg(5, m_linkData.m_clLinksCLength.getBuffer());
 
-	int	numWorkItems = workGroupSize*((m_linkData.getNumLinks() + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(prepareLinksKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum;
+	int numLinks = m_linkData.getNumLinks();
+	ciErrNum = clSetKernelArg(prepareLinksKernel,0, sizeof(int), &numLinks);
+	ciErrNum = clSetKernelArg(prepareLinksKernel,1, sizeof(cl_mem), &m_linkData.m_clLinks.m_buffer);
+	ciErrNum = clSetKernelArg(prepareLinksKernel,2, sizeof(cl_mem), &m_linkData.m_clLinksMassLSC.m_buffer);
+	ciErrNum = clSetKernelArg(prepareLinksKernel,3, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
+	ciErrNum = clSetKernelArg(prepareLinksKernel,4, sizeof(cl_mem), &m_linkData.m_clLinksLengthRatio.m_buffer);
+	ciErrNum = clSetKernelArg(prepareLinksKernel,5, sizeof(cl_mem), &m_linkData.m_clLinksCLength.m_buffer);
+
+	size_t	numWorkItems = workGroupSize*((m_linkData.getNumLinks() + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,prepareLinksKernel, 1 , NULL, &numWorkItems, &workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(prepareLinksKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(prepareLinksKernel)");
 	}
+
 }
 
 void btOpenCLSoftBodySolver::updatePositionsFromVelocities( float solverdt )
 {
-	updatePositionsFromVelocitiesKernel.kernel.setArg(0, m_vertexData.getNumVertices());
-	updatePositionsFromVelocitiesKernel.kernel.setArg(1, solverdt);
-	updatePositionsFromVelocitiesKernel.kernel.setArg(2, m_vertexData.m_clVertexVelocity.getBuffer());
-	updatePositionsFromVelocitiesKernel.kernel.setArg(3, m_vertexData.m_clVertexPreviousPosition.getBuffer());
-	updatePositionsFromVelocitiesKernel.kernel.setArg(4, m_vertexData.m_clVertexPosition.getBuffer());
 
-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(updatePositionsFromVelocitiesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum;
+	int numVerts = m_vertexData.getNumVertices();
+	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,0, sizeof(int), &numVerts);
+	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,1, sizeof(float), &solverdt);
+	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,2, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
+	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,3, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
+	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,4, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
+
+	size_t	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,updatePositionsFromVelocitiesKernel, 1, NULL, &numWorkItems,&workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(updatePositionsFromVelocitiesKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(updatePositionsFromVelocitiesKernel)");
 	}
+
 }
 
 void btOpenCLSoftBodySolver::solveLinksForPosition( int startLink, int numLinks, float kst, float ti )
 {
-	solvePositionsFromLinksKernel.kernel.setArg(0, startLink);
-	solvePositionsFromLinksKernel.kernel.setArg(1, numLinks);
-	solvePositionsFromLinksKernel.kernel.setArg(2, kst);
-	solvePositionsFromLinksKernel.kernel.setArg(3, ti);
-	solvePositionsFromLinksKernel.kernel.setArg(4, m_linkData.m_clLinks.getBuffer());
-	solvePositionsFromLinksKernel.kernel.setArg(5, m_linkData.m_clLinksMassLSC.getBuffer());
-	solvePositionsFromLinksKernel.kernel.setArg(6, m_linkData.m_clLinksRestLengthSquared.getBuffer());
-	solvePositionsFromLinksKernel.kernel.setArg(7, m_vertexData.m_clVertexInverseMass.getBuffer());
-	solvePositionsFromLinksKernel.kernel.setArg(8, m_vertexData.m_clVertexPosition.getBuffer());
 
-	int	numWorkItems = workGroupSize*((numLinks + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(solvePositionsFromLinksKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum;
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,0, sizeof(int), &startLink);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,1, sizeof(int), &numLinks);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,2, sizeof(float), &kst);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,3, sizeof(float), &ti);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,4, sizeof(cl_mem), &m_linkData.m_clLinks.m_buffer);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,5, sizeof(cl_mem), &m_linkData.m_clLinksMassLSC.m_buffer);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,6, sizeof(cl_mem), &m_linkData.m_clLinksRestLengthSquared.m_buffer);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,7, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,8, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
+
+	size_t	numWorkItems = workGroupSize*((numLinks + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,solvePositionsFromLinksKernel,1,NULL,&numWorkItems,&workGroupSize,0,0,0);
+	if( ciErrNum!= CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(solvePositionsFromLinksKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(solvePositionsFromLinksKernel)");
 	}
+
 } // solveLinksForPosition
 
 
+void btOpenCLSoftBodySolver::solveLinksForVelocity( int startLink, int numLinks, float kst )
+{
+
+	cl_int ciErrNum;
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 0, sizeof(int), &startLink);
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 1, sizeof(int), &numLinks);
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 2, sizeof(cl_mem), &m_linkData.m_clLinks.m_buffer);
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 3, sizeof(cl_mem), &m_linkData.m_clLinksLengthRatio.m_buffer);
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 4, sizeof(cl_mem), &m_linkData.m_clLinksCLength.m_buffer);
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
+	ciErrNum = clSetKernelArg(vSolveLinksKernel, 6, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
+
+	size_t	numWorkItems = workGroupSize*((numLinks + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,vSolveLinksKernel,1,NULL,&numWorkItems, &workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
+	{
+		btAssert( 0 &&  "enqueueNDRangeKernel(vSolveLinksKernel)");
+	}
+
+}
+
 void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithVelocities( float isolverdt )
 {
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(0, m_vertexData.getNumVertices());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(1, isolverdt);
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(2, m_vertexData.m_clVertexPosition.getBuffer());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(3, m_vertexData.m_clVertexPreviousPosition.getBuffer());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(4, m_vertexData.m_clClothIdentifier.getBuffer());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(5, m_clPerClothVelocityCorrectionCoefficient.getBuffer());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(6, m_clPerClothDampingFactor.getBuffer());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(7, m_vertexData.m_clVertexVelocity.getBuffer());
-	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(8, m_vertexData.m_clVertexForceAccumulator.getBuffer());
 
-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(updateVelocitiesFromPositionsWithVelocitiesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum;
+	int numVerts = m_vertexData.getNumVertices();
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel,0, sizeof(int), &numVerts);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 1, sizeof(float), &isolverdt);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 2, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 4, sizeof(cl_mem), &m_vertexData.m_clClothIdentifier.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 5, sizeof(cl_mem), &m_clPerClothVelocityCorrectionCoefficient.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 6, sizeof(cl_mem), &m_clPerClothDampingFactor.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 7, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 8, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
+
+	size_t	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,updateVelocitiesFromPositionsWithVelocitiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(updateVelocitiesFromPositionsWithVelocitiesKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(updateVelocitiesFromPositionsWithVelocitiesKernel)");
 	}
 
+
 } // updateVelocitiesFromPositionsWithVelocities
 
 void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( float isolverdt )
 {
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(0, m_vertexData.getNumVertices());
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(1, isolverdt);
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(2, m_vertexData.m_clVertexPosition.getBuffer());
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(3, m_vertexData.m_clVertexPreviousPosition.getBuffer());
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(4, m_vertexData.m_clClothIdentifier.getBuffer());
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(5, m_clPerClothDampingFactor.getBuffer());
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(6, m_vertexData.m_clVertexVelocity.getBuffer());
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(7, m_vertexData.m_clVertexForceAccumulator.getBuffer());
 
-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
-	if( err != CL_SUCCESS ) 
+	cl_int ciErrNum;
+	int numVerts = m_vertexData.getNumVertices();
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 0, sizeof(int), &numVerts);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 1, sizeof(float), &isolverdt);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 2, sizeof(cl_mem),&m_vertexData.m_clVertexPosition.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 3, sizeof(cl_mem),&m_vertexData.m_clVertexPreviousPosition.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 4, sizeof(cl_mem),&m_vertexData.m_clClothIdentifier.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 5, sizeof(cl_mem),&m_clPerClothDampingFactor.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 6, sizeof(cl_mem),&m_vertexData.m_clVertexVelocity.m_buffer);
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 7, sizeof(cl_mem),&m_vertexData.m_clVertexForceAccumulator.m_buffer);
+
+	size_t	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,updateVelocitiesFromPositionsWithoutVelocitiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(updateVelocitiesFromPositionsWithoutVelocitiesKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(updateVelocitiesFromPositionsWithoutVelocitiesKernel)");
 	}
+
 } // updateVelocitiesFromPositionsWithoutVelocities
 
 // End kernel dispatches
@@ -1133,15 +1168,20 @@ void btOpenCLSoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * cons
 	// and use them together on a single kernel call if possible by setting up a
 	// per-cloth target buffer array for the copy kernel.
 
-	btAcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );
+
+	btOpenCLAcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );
+
+	const int firstVertex = currentCloth->getFirstVertex();
+	const int lastVertex = firstVertex + currentCloth->getNumVertices();
 
 	if( vertexBuffer->getBufferType() == btVertexBufferDescriptor::CPU_BUFFER )
 	{		
-		const int firstVertex = currentCloth->getFirstVertex();
-		const int lastVertex = firstVertex + currentCloth->getNumVertices();
 		const btCPUVertexBufferDescriptor *cpuVertexBuffer = static_cast< btCPUVertexBufferDescriptor* >(vertexBuffer);						
 		float *basePointer = cpuVertexBuffer->getBasePointer();						
 
+		m_vertexData.m_clVertexPosition.copyFromGPU();
+		m_vertexData.m_clVertexNormal.copyFromGPU();
+
 		if( vertexBuffer->hasVertexPositions() )
 		{
 			const int vertexOffset = cpuVertexBuffer->getVertexOffset();
@@ -1173,43 +1213,46 @@ void btOpenCLSoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * cons
 			}
 		}
 	}
+
 } // btCPUSoftBodySolver::outputToVertexBuffers
 
 
-btOpenCLSoftBodySolver::KernelDesc btOpenCLSoftBodySolver::compileCLKernelFromString( const char *shaderString, const char *shaderName )
+cl_kernel btOpenCLSoftBodySolver::compileCLKernelFromString( const char* kernelSource, const char* kernelName )
 {
-	cl_int err;
+	printf("compiling kernalName: %s ",kernelName);
+	cl_kernel kernel;
+	cl_int ciErrNum;
+	size_t program_length = strlen(kernelSource);
 
-	context = m_queue.getInfo<CL_QUEUE_CONTEXT>();
-	device = m_queue.getInfo<CL_QUEUE_DEVICE>();
-	std::vector< cl::Device > devices;
-	devices.push_back( device );
+	cl_program m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&kernelSource, &program_length, &ciErrNum);
+//	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		
+    // Build the program (the '-cl-mad-enable' optimization option is only passed when MAC is defined)
+#ifdef MAC
+	char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
+#else
+	const char* flags = "-DGUID_ARG=";
+#endif
+    ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, flags, NULL, NULL);
+    if (ciErrNum != CL_SUCCESS)
+    {
+        printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
+		btAssert(0);
+        exit(0);
+    }
+	
+    // Create the kernel
+    kernel = clCreateKernel(m_cpProgram, kernelName, &ciErrNum);
+    if (ciErrNum != CL_SUCCESS)
+    {
+        printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
+		btAssert(0);
+		exit(0);
+    }
 
-	cl::Program::Sources source(1, std::make_pair(shaderString, strlen(shaderString) + 1));
-	cl::Program program(context, source, &err);
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert( "program" );
-	}
-	err = program.build(devices);
-	if (err != CL_SUCCESS) {
-		//std::string str;
-		//str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
-		//std::cout << "Program Info: " << str;
-		if( err != CL_SUCCESS ) 
-		{
-			btAssert( "Program::build()" );
-		}
-	}
-	cl::Kernel kernel(program, shaderName, &err);
-	if( err != CL_SUCCESS ) 
-	{
-		btAssert( "kernel" );
-	}
+	printf("ready. \n");
+	return kernel;
 
-	KernelDesc descriptor;
-	descriptor.kernel = kernel;
-	return descriptor;
 }
 
 void btOpenCLSoftBodySolver::predictMotion( float timeStep )
@@ -1234,11 +1277,11 @@ void btOpenCLSoftBodySolver::predictMotion( float timeStep )
 
 
 
-btOpenCLSoftBodySolver::btAcceleratedSoftBodyInterface *btOpenCLSoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
+btOpenCLAcceleratedSoftBodyInterface *btOpenCLSoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
 {
 	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
 	{
-		btAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
+		btOpenCLAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
 		if( softBodyInterface->getSoftBody() == softBody )
 			return softBodyInterface;
 	}
@@ -1273,4 +1316,4 @@ bool btOpenCLSoftBodySolver::buildShaders()
 		m_shadersInitialized = true;
 
 	return returnVal;
-}
\ No newline at end of file
+}
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
index 84d208488..b023d475c 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
@@ -16,204 +16,165 @@ subject to the following restrictions:
 #ifndef BT_SOFT_BODY_SOLVER_OPENCL_H
 #define BT_SOFT_BODY_SOLVER_OPENCL_H
 
+#include "stddef.h" //for size_t
 #include "vectormath/vmInclude.h"
+
 #include "BulletSoftBody/btSoftBodySolvers.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
+#include "btSoftBodySolverLinkData_OpenCL.h"
+#include "btSoftBodySolverVertexData_OpenCL.h"
+#include "btSoftBodySolverTriangleData_OpenCL.h"
+
+
+/**
+ * SoftBody class to maintain information about a soft body instance
+ * within a solver.
+ * This data addresses the main solver arrays.
+ */
+class btOpenCLAcceleratedSoftBodyInterface
+{
+protected:
+	/** Current number of vertices that are part of this cloth */
+	int m_numVertices;
+	/** Maximum number of vertices allocated to be part of this cloth */
+	int m_maxVertices;
+	/** Current number of triangles that are part of this cloth */
+	int m_numTriangles;
+	/** Maximum number of triangles allocated to be part of this cloth */
+	int m_maxTriangles;
+	/** Index of first vertex in the world allocated to this cloth */
+	int m_firstVertex;
+	/** Index of first triangle in the world allocated to this cloth */
+	int m_firstTriangle;
+	/** Index of first link in the world allocated to this cloth */
+	int m_firstLink;
+	/** Maximum number of links allocated to this cloth */
+	int m_maxLinks;
+	/** Current number of links allocated to this cloth */
+	int m_numLinks;
+
+	/** The actual soft body this data represents */
+	btSoftBody *m_softBody;
+
+
+public:
+	btOpenCLAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
+	  m_softBody( softBody )
+	{
+		m_numVertices = 0;
+		m_maxVertices = 0;
+		m_numTriangles = 0;
+		m_maxTriangles = 0;
+		m_firstVertex = 0;
+		m_firstTriangle = 0;
+		m_firstLink = 0;
+		m_maxLinks = 0;
+		m_numLinks = 0;
+	}
+	int getNumVertices()
+	{
+		return m_numVertices;
+	}
+
+	int getNumTriangles()
+	{
+		return m_numTriangles;
+	}
+
+	int getMaxVertices()
+	{
+		return m_maxVertices;
+	}
+
+	int getMaxTriangles()
+	{
+		return m_maxTriangles;
+	}
+
+	int getFirstVertex()
+	{
+		return m_firstVertex;
+	}
+
+	int getFirstTriangle()
+	{
+		return m_firstTriangle;
+	}
+
+	// TODO: All of these set functions will have to do checks and
+	// update the world because restructuring of the arrays will be necessary
+	// Reasonable use of "friend"?
+	void setNumVertices( int numVertices )
+	{
+		m_numVertices = numVertices;
+	}	
+	
+	void setNumTriangles( int numTriangles )
+	{
+		m_numTriangles = numTriangles;
+	}
+
+	void setMaxVertices( int maxVertices )
+	{
+		m_maxVertices = maxVertices;
+	}
+
+	void setMaxTriangles( int maxTriangles )
+	{
+		m_maxTriangles = maxTriangles;
+	}
+
+	void setFirstVertex( int firstVertex )
+	{
+		m_firstVertex = firstVertex;
+	}
+
+	void setFirstTriangle( int firstTriangle )
+	{
+		m_firstTriangle = firstTriangle;
+	}
+
+	void setMaxLinks( int maxLinks )
+	{
+		m_maxLinks = maxLinks;
+	}
+
+	void setNumLinks( int numLinks )
+	{
+		m_numLinks = numLinks;
+	}
+
+	void setFirstLink( int firstLink )
+	{
+		m_firstLink = firstLink;
+	}
+
+	int getMaxLinks()
+	{
+		return m_maxLinks;
+	}
+
+	int getNumLinks()
+	{
+		return m_numLinks;
+	}
+
+	int getFirstLink()
+	{
+		return m_firstLink;
+	}
+
+	btSoftBody* getSoftBody()
+	{
+		return m_softBody;
+	}
+
+};
 
 
 class btOpenCLSoftBodySolver : public btSoftBodySolver
 {
 private:
-	/**
-	 * SoftBody class to maintain information about a soft body instance
-	 * within a solver.
-	 * This data addresses the main solver arrays.
-	 */
-	class btAcceleratedSoftBodyInterface
-	{
-	protected:
-		/** Current number of vertices that are part of this cloth */
-		int m_numVertices;
-		/** Maximum number of vertices allocated to be part of this cloth */
-		int m_maxVertices;
-		/** Current number of triangles that are part of this cloth */
-		int m_numTriangles;
-		/** Maximum number of triangles allocated to be part of this cloth */
-		int m_maxTriangles;
-		/** Index of first vertex in the world allocated to this cloth */
-		int m_firstVertex;
-		/** Index of first triangle in the world allocated to this cloth */
-		int m_firstTriangle;
-		/** Index of first link in the world allocated to this cloth */
-		int m_firstLink;
-		/** Maximum number of links allocated to this cloth */
-		int m_maxLinks;
-		/** Current number of links allocated to this cloth */
-		int m_numLinks;
-
-		/** The actual soft body this data represents */
-		btSoftBody *m_softBody;
-
-
-	public:
-		btAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
-		  m_softBody( softBody )
-		{
-			m_numVertices = 0;
-			m_maxVertices = 0;
-			m_numTriangles = 0;
-			m_maxTriangles = 0;
-			m_firstVertex = 0;
-			m_firstTriangle = 0;
-			m_firstLink = 0;
-			m_maxLinks = 0;
-			m_numLinks = 0;
-		}
-		int getNumVertices()
-		{
-			return m_numVertices;
-		}
-
-		int getNumTriangles()
-		{
-			return m_numTriangles;
-		}
-
-		int getMaxVertices()
-		{
-			return m_maxVertices;
-		}
-
-		int getMaxTriangles()
-		{
-			return m_maxTriangles;
-		}
-
-		int getFirstVertex()
-		{
-			return m_firstVertex;
-		}
-
-		int getFirstTriangle()
-		{
-			return m_firstTriangle;
-		}
-
-		// TODO: All of these set functions will have to do checks and
-		// update the world because restructuring of the arrays will be necessary
-		// Reasonable use of "friend"?
-		void setNumVertices( int numVertices )
-		{
-			m_numVertices = numVertices;
-		}	
-		
-		void setNumTriangles( int numTriangles )
-		{
-			m_numTriangles = numTriangles;
-		}
-
-		void setMaxVertices( int maxVertices )
-		{
-			m_maxVertices = maxVertices;
-		}
-
-		void setMaxTriangles( int maxTriangles )
-		{
-			m_maxTriangles = maxTriangles;
-		}
-
-		void setFirstVertex( int firstVertex )
-		{
-			m_firstVertex = firstVertex;
-		}
-
-		void setFirstTriangle( int firstTriangle )
-		{
-			m_firstTriangle = firstTriangle;
-		}
-
-		void setMaxLinks( int maxLinks )
-		{
-			m_maxLinks = maxLinks;
-		}
-
-		void setNumLinks( int numLinks )
-		{
-			m_numLinks = numLinks;
-		}
-
-		void setFirstLink( int firstLink )
-		{
-			m_firstLink = firstLink;
-		}
-
-		int getMaxLinks()
-		{
-			return m_maxLinks;
-		}
-
-		int getNumLinks()
-		{
-			return m_numLinks;
-		}
-
-		int getFirstLink()
-		{
-			return m_firstLink;
-		}
-
-		btSoftBody* getSoftBody()
-		{
-			return m_softBody;
-		}
-
-	#if 0
-		void setAcceleration( Vectormath::Aos::Vector3 acceleration )
-		{
-			m_currentSolver->setPerClothAcceleration( m_clothIdentifier, acceleration );
-		}
-
-		void setWindVelocity( Vectormath::Aos::Vector3 windVelocity )
-		{
-			m_currentSolver->setPerClothWindVelocity( m_clothIdentifier, windVelocity );
-		}
-
-		/** 
-		 * Set the density of the air in which the cloth is situated.
-		 */
-		void setAirDensity( btScalar density )
-		{
-			m_currentSolver->setPerClothMediumDensity( m_clothIdentifier, static_cast<float>(density) );
-		}
-
-		/**
-		 * Add a collision object to this soft body.
-		 */
-		void addCollisionObject( btCollisionObject *collisionObject )
-		{
-			m_currentSolver->addCollisionObjectForSoftBody( m_clothIdentifier, collisionObject );
-		}
-	#endif
-	};
-
-
-	class KernelDesc
-	{
-	protected:
-	public:
-		cl::Kernel kernel;
-
-		KernelDesc()
-		{
-		}
-
-		virtual ~KernelDesc()
-		{
-		}
-	}; 
 
 	btSoftBodyLinkDataOpenCL m_linkData;
 	btSoftBodyVertexDataOpenCL m_vertexData;
@@ -228,7 +189,7 @@ private:
 	 * Cloths owned by this solver.
 	 * Only our cloths are in this array.
 	 */
-	btAlignedObjectArray< btAcceleratedSoftBodyInterface * > m_softBodySet;
+	btAlignedObjectArray< btOpenCLAcceleratedSoftBodyInterface * > m_softBodySet;
 
 	/** Acceleration value to be applied to all non-static vertices in the solver. 
 	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
@@ -262,37 +223,34 @@ private:
 	btAlignedObjectArray< float >						m_perClothMediumDensity;
 	btOpenCLBuffer<float>								m_clPerClothMediumDensity;
 
-	KernelDesc		prepareLinksKernel;
-	KernelDesc		solvePositionsFromLinksKernel;
-	KernelDesc		updateConstantsKernel;
-	KernelDesc		integrateKernel;
-	KernelDesc		addVelocityKernel;
-	KernelDesc		updatePositionsFromVelocitiesKernel;
-	KernelDesc		updateVelocitiesFromPositionsWithoutVelocitiesKernel;
-	KernelDesc		updateVelocitiesFromPositionsWithVelocitiesKernel;
-	KernelDesc		vSolveLinksKernel;
-	KernelDesc		resetNormalsAndAreasKernel;
-	KernelDesc		normalizeNormalsAndAreasKernel;
-	KernelDesc		updateSoftBodiesKernel;
-	KernelDesc		outputToVertexArrayWithNormalsKernel;
-	KernelDesc		outputToVertexArrayWithoutNormalsKernel;
+	cl_kernel		prepareLinksKernel;
+	cl_kernel		solvePositionsFromLinksKernel;
+	cl_kernel		updateConstantsKernel;
+	cl_kernel		integrateKernel;
+	cl_kernel		addVelocityKernel;
+	cl_kernel		updatePositionsFromVelocitiesKernel;
+	cl_kernel		updateVelocitiesFromPositionsWithoutVelocitiesKernel;
+	cl_kernel		updateVelocitiesFromPositionsWithVelocitiesKernel;
+	cl_kernel		vSolveLinksKernel;
+	cl_kernel		resetNormalsAndAreasKernel;
+	cl_kernel		normalizeNormalsAndAreasKernel;
+	cl_kernel		updateSoftBodiesKernel;
+	cl_kernel		outputToVertexArrayWithNormalsKernel;
+	cl_kernel		outputToVertexArrayWithoutNormalsKernel;
 
-	KernelDesc		outputToVertexArrayKernel;
-	KernelDesc		applyForcesKernel;
-	KernelDesc		collideSphereKernel;
-	KernelDesc		collideCylinderKernel;
+	cl_kernel		outputToVertexArrayKernel;
+	cl_kernel		applyForcesKernel;
+	cl_kernel		collideSphereKernel;
+	cl_kernel		collideCylinderKernel;
 
-	static const int workGroupSize = 128;
-
-	cl::CommandQueue m_queue;
-	cl::Context context;
-	cl::Device device;
+	cl_command_queue	m_cqCommandQue;
+	cl_context			m_cxMainContext;
 
 
 	/**
-	 * Compile a compute shader kernel from a string and return the appropriate KernelDesc object.
+	 * Compile a compute shader kernel from a string and return the appropriate cl_kernel object.
 	 */
-	KernelDesc compileCLKernelFromString( const char *shaderString, const char *shaderName );
+	cl_kernel compileCLKernelFromString( const char *shaderString, const char *shaderName );
 
 	bool buildShaders();
 
@@ -306,7 +264,7 @@ private:
 
 	void ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce );
 	
-	btAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
+	btOpenCLAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
 
 	virtual void applyForces( float solverdt );
 
@@ -342,7 +300,7 @@ private:
 	
 
 public:
-	btOpenCLSoftBodySolver(const cl::CommandQueue &queue);
+	btOpenCLSoftBodySolver(cl_command_queue queue,cl_context	ctx);
 
 	virtual ~btOpenCLSoftBodySolver();
 
@@ -371,4 +329,4 @@ public:
 	virtual void copySoftBodyToVertexBuffer( const btSoftBody *const softBody, btVertexBufferDescriptor *vertexBuffer );
 }; // btOpenCLSoftBodySolver
 
-#endif #ifndef BT_SOFT_BODY_SOLVER_OPENCL_H
\ No newline at end of file
+#endif // #ifndef BT_SOFT_BODY_SOLVER_OPENCL_H
diff --git a/src/BulletSoftBody/btSoftBodySolvers.h b/src/BulletSoftBody/btSoftBodySolvers.h
index 24a742e49..79a55f706 100644
--- a/src/BulletSoftBody/btSoftBodySolvers.h
+++ b/src/BulletSoftBody/btSoftBodySolvers.h
@@ -23,7 +23,6 @@ class btSoftBodyTriangleData;
 class btSoftBodyLinkData;
 class btSoftBodyVertexData;
 class btVertexBufferDescriptor;
-class btAcceleratedSoftBodyInterface;
 class btCollisionObject;
 class btSoftBody;
 
diff --git a/src/MiniCL/MiniCL.cpp b/src/MiniCL/MiniCL.cpp
index 9cb1ca331..1e0823a1c 100644
--- a/src/MiniCL/MiniCL.cpp
+++ b/src/MiniCL/MiniCL.cpp
@@ -30,6 +30,7 @@ subject to the following restrictions:
 //#define DEBUG_MINICL_KERNELS 1
 
 static char* spPlatformID = "MiniCL, SCEA";
+static const char* spDriverVersion= "1.0";	// text copied out for the CL_DRIVER_VERSION device query; const because it points at a string literal
 
 CL_API_ENTRY cl_int CL_API_CALL clGetPlatformIDs(
 	cl_uint           num_entries,
@@ -91,23 +92,24 @@ CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(
 	cl_device_info          param_name ,
 	size_t                  param_value_size ,
 	void *                  param_value ,
-	size_t *                /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0
+	size_t *                param_value_size_ret) CL_API_SUFFIX__VERSION_1_0
 {
 
 	switch (param_name)
 	{
 	case CL_DEVICE_NAME:
 		{
-			char deviceName[] = "CPU";
+			char deviceName[] = "MiniCL CPU";
 			unsigned int nameLen = strlen(deviceName)+1;
 			btAssert(param_value_size>strlen(deviceName));
 			if (nameLen < param_value_size)
 			{
-				const char* cpuName = "CPU";
+				const char* cpuName = "MiniCL CPU";
 				sprintf((char*)param_value,"%s",cpuName);
 			} else
 			{
 				printf("error: param_value_size should be at least %d, but it is %d\n",nameLen,param_value_size);
+				return CL_INVALID_VALUE; 
 			}
 			break;
 		}
@@ -120,6 +122,7 @@ CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(
 			} else
 			{
 				printf("error: param_value_size should be at least %d\n",sizeof(cl_device_type));
+				return CL_INVALID_VALUE; 
 			}
 			break;
 		}
@@ -132,6 +135,7 @@ CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(
 			} else
 			{
 				printf("error: param_value_size should be at least %d\n",sizeof(cl_uint));
+				return CL_INVALID_VALUE; 
 			}
 
 			break;
@@ -149,6 +153,7 @@ CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(
 			} else
 			{
 				printf("error: param_value_size should be at least %d\n",sizeof(cl_uint));
+				return CL_INVALID_VALUE; 
 			}
 			break;
 		}
@@ -158,6 +163,142 @@ CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(
 			 *clock_frequency = 3*1024;
 			break;
 		}
+
+	case CL_DEVICE_VENDOR	:
+		{
+			if(param_value_size < (strlen(spPlatformID) + 1))
+			{
+				return CL_INVALID_VALUE; 
+			}
+			strcpy((char*)param_value, spPlatformID);
+			if(param_value_size_ret != NULL)
+			{
+				*param_value_size_ret = strlen(spPlatformID) + 1;
+			}
+			break;
+		}
+	case CL_DRIVER_VERSION:
+		{
+			if(param_value_size < (strlen(spDriverVersion) + 1))
+			{
+				return CL_INVALID_VALUE; 
+			}
+			strcpy((char*)param_value, spDriverVersion);
+			if(param_value_size_ret != NULL)
+			{
+				*param_value_size_ret = strlen(spDriverVersion) + 1;
+			}
+
+			break;
+		}
+	case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
+		{
+			 cl_uint* maxDimensions = (cl_uint*)param_value;
+			 *maxDimensions = 1;
+			 break;
+		}
+		case CL_DEVICE_MAX_WORK_GROUP_SIZE:
+		{
+			 if (param_value_size < sizeof(size_t)) return CL_INVALID_VALUE; // OpenCL types this result as size_t, so the buffer must hold one
+			 *(size_t*)param_value = 128; // previously stored through a cl_uint*, which leaves the upper bytes of a 64-bit size_t unset
+			 break;
+		}
+		case CL_DEVICE_ADDRESS_BITS:
+		{
+			 cl_uint* addressBits = (cl_uint*)param_value;
+			 *addressBits= 32; //@todo: should this be 64 for 64bit builds?
+			 break;
+		}
+		case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
+			{
+				cl_ulong* maxMemAlloc = (cl_ulong*)param_value;
+				*maxMemAlloc= 512*1024*1024; //this "should be enough for everyone" ?
+			 break;
+			}
+		case CL_DEVICE_GLOBAL_MEM_SIZE:
+			{
+				cl_ulong* maxMemAlloc = (cl_ulong*)param_value;
+				*maxMemAlloc= 1024*1024*1024; //this "should be enough for everyone" ?
+			 break;
+			}
+
+		case CL_DEVICE_ERROR_CORRECTION_SUPPORT:
+			{
+			cl_bool* error_correction_support = (cl_bool*)param_value;
+			*error_correction_support = CL_FALSE;
+			break;
+			}
+
+		case CL_DEVICE_LOCAL_MEM_TYPE:
+			{
+			cl_device_local_mem_type* local_mem_type = (cl_device_local_mem_type*)param_value;
+			*local_mem_type = CL_GLOBAL;
+			break;
+			}
+		case CL_DEVICE_LOCAL_MEM_SIZE:
+			{
+				cl_ulong* localmem = (cl_ulong*) param_value;
+				*localmem = 32*1024;
+				break;
+			}
+
+		case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:
+			{
+				cl_ulong* localmem = (cl_ulong*) param_value;
+				*localmem = 64*1024;
+				break;
+			}
+		case CL_DEVICE_QUEUE_PROPERTIES:
+			{
+				cl_command_queue_properties* queueProp = (cl_command_queue_properties*) param_value;
+				memset(queueProp,0,param_value_size);
+
+				break;
+			}
+		case CL_DEVICE_IMAGE_SUPPORT:
+			{
+				cl_bool* imageSupport = (cl_bool*) param_value;
+				*imageSupport = CL_FALSE;
+				break;
+			}
+
+		case CL_DEVICE_MAX_WRITE_IMAGE_ARGS:
+		case CL_DEVICE_MAX_READ_IMAGE_ARGS:
+			{
+				cl_uint* imageArgs = (cl_uint*) param_value;
+				*imageArgs = 0;
+				break;
+			}
+		case CL_DEVICE_IMAGE3D_MAX_DEPTH:
+		case CL_DEVICE_IMAGE3D_MAX_HEIGHT:
+		case CL_DEVICE_IMAGE2D_MAX_HEIGHT:
+		case CL_DEVICE_IMAGE3D_MAX_WIDTH:
+		case CL_DEVICE_IMAGE2D_MAX_WIDTH:
+			{
+				size_t* maxSize = (size_t*) param_value;
+				*maxSize = 0;
+				break;
+			}
+
+		case CL_DEVICE_EXTENSIONS:
+			{
+				char* extensions = (char*) param_value;
+				*extensions = 0;
+				break;
+			}
+
+		case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
+		case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:
+		case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:
+		case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:
+		case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
+		case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
+			{
+				cl_uint* width  = (cl_uint*) param_value;
+				*width = 1;
+				break;
+			}
+			
 	default:
 		{
 			printf("error: unsupported param_name:%d\n",param_name);
@@ -486,7 +627,7 @@ extern CL_API_ENTRY cl_int CL_API_CALL clGetContextInfo(cl_context         /* co
 }
 
 CL_API_ENTRY cl_context CL_API_CALL clCreateContextFromType(cl_context_properties * /* properties */,
-                        cl_device_type          /* device_type */,
+                        cl_device_type           device_type ,
                         void (*pfn_notify)(const char *, const void *, size_t, void *) /* pfn_notify */,
                         void *                  /* user_data */,
                         cl_int *                 errcode_ret ) CL_API_SUFFIX__VERSION_1_0
@@ -502,14 +643,18 @@ CL_API_ENTRY cl_context CL_API_CALL clCreateContextFromType(cl_context_propertie
 		"MiniCL_0", "MiniCL_1", "MiniCL_2", "MiniCL_3", "MiniCL_4", "MiniCL_5", "MiniCL_6", "MiniCL_7" 
 	};
 
-#ifdef DEBUG_MINICL_KERNELS
-	SequentialThreadSupport::SequentialThreadConstructionInfo stc("MiniCL",processMiniCLTask,createMiniCLLocalStoreMemory);
-	SequentialThreadSupport* threadSupport = new SequentialThreadSupport(stc);
-#else
+	btThreadSupportInterface* threadSupport = 0;
+
+	if (device_type==CL_DEVICE_TYPE_DEBUG)
+	{
+		SequentialThreadSupport::SequentialThreadConstructionInfo stc("MiniCL",processMiniCLTask,createMiniCLLocalStoreMemory);
+		threadSupport = new SequentialThreadSupport(stc);
+	} else
+	{
 
 #if _WIN32
 	btAssert(sUniqueThreadSupportIndex < maxNumOfThreadSupports);
-	Win32ThreadSupport* threadSupport = new Win32ThreadSupport(Win32ThreadSupport::Win32ThreadConstructionInfo(
+	threadSupport = new Win32ThreadSupport(Win32ThreadSupport::Win32ThreadConstructionInfo(
 //								"MiniCL",
 								sUniqueThreadSupportName[sUniqueThreadSupportIndex++],
 								processMiniCLTask, //processCollisionTask,
@@ -518,10 +663,10 @@ CL_API_ENTRY cl_context CL_API_CALL clCreateContextFromType(cl_context_propertie
 #else
 	///todo: add posix thread support for other platforms
 	SequentialThreadSupport::SequentialThreadConstructionInfo stc("MiniCL",processMiniCLTask,createMiniCLLocalStoreMemory);
-	SequentialThreadSupport* threadSupport = new SequentialThreadSupport(stc);
+	threadSupport = new SequentialThreadSupport(stc);
 #endif
 
-#endif //DEBUG_MINICL_KERNELS
+	}
 	
 	
 	MiniCLTaskScheduler* scheduler = new MiniCLTaskScheduler(threadSupport,maxNumOutstandingTasks);
diff --git a/src/MiniCL/cl.h b/src/MiniCL/cl.h
index b0cda4237..053491ee2 100644
--- a/src/MiniCL/cl.h
+++ b/src/MiniCL/cl.h
@@ -155,8 +155,10 @@ typedef struct _cl_image_format {
 #define CL_DEVICE_TYPE_CPU                          (1 << 1)
 #define CL_DEVICE_TYPE_GPU                          (1 << 2)
 #define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
+#define CL_DEVICE_TYPE_DEBUG						(1 << 4)
 #define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
 
+
 // cl_device_info
 #define CL_DEVICE_TYPE                              0x1000
 #define CL_DEVICE_VENDOR_ID                         0x1001
diff --git a/src/MiniCL/cl_MiniCL_Defs.h b/src/MiniCL/cl_MiniCL_Defs.h
index ffdac1026..4a7a84526 100644
--- a/src/MiniCL/cl_MiniCL_Defs.h
+++ b/src/MiniCL/cl_MiniCL_Defs.h
@@ -140,6 +140,8 @@ static float4 operator+(const float4& a,const float4& b)
 	return tmp;
 }
 
+
+
 static float4 operator-(const float4& a,const float4& b)
 {
 	float4 tmp;
@@ -159,6 +161,17 @@ static float4 operator*(float a,const float4& b)
 	return tmp;
 }
 
+static float4 operator/(const float4& b,float a)
+{
+	float4 quotient = b;	// copy b, then divide each of its four components by the scalar a
+	quotient.x /= a;
+	quotient.y /= a;
+	quotient.z /= a;
+	quotient.w /= a;
+	return quotient;
+}
+
+
 
 static float dot(const float4&a ,const float4& b)
 {
@@ -170,6 +183,22 @@ static float dot(const float4&a ,const float4& b)
 	return tmp.x+tmp.y+tmp.z+tmp.w;
 }
 
+static float length(const float4&a)
+{
+	// Square root of the summed squares of x, y and z; a.w is not part of the sum.
+	return sqrtf(a.x*a.x+a.y*a.y+a.z*a.z);
+}
+
+static float4 normalize(const float4&a)
+{
+	// Multiply a by the reciprocal of length(a) using operator*(float, float4).
+	// NOTE(review): a zero-length input divides by zero here -- confirm callers never pass one.
+	const float reciprocal = 1.f/length(a);
+	return reciprocal*a;
+}
+
+
+
 static float4 cross(const float4&a ,const float4& b)
 {
 	float4 tmp;