added OpenCL cloth demo, contributed by AMD.

updated GpuSoftBodySolvers updated DirectCompute cloth demo
2010-08-14 00:56:17 +00:00
parent 40958f2b4a
commit 4f9b450200
72 changed files with 7524 additions and 843 deletions
--- a/Demos/CMakeLists.txt
+++ b/Demos/CMakeLists.txt
@@ -14,7 +14,7 @@ IF(BUILD_CPU_DEMOS)
 		CollisionInterfaceDemo ConcaveConvexcastDemo SimplexDemo DynamicControlDemo
 		DoublePrecisionDemo ConcaveDemo CollisionDemo
 		ContinuousConvexCollision ConcaveRaycastDemo GjkConvexCastDemo
-		MultiMaterialDemo SerializeDemo InternalEdgeDemo
+		MultiMaterialDemo SerializeDemo InternalEdgeDemo 
 	)
 ELSE()
 	SET(SharedDemoSubdirs
@@ -28,6 +28,7 @@ ENDIF()
 		MultiThreadedDemo
 		VectorAdd_OpenCL
 		ParticlesOpenCL
 		OpenCLClothDemo
 		)
 ELSE (USE_GLUT)
--- a/Demos/DX11ClothDemo/btDirectComputeSupport.h
+++ b/Demos/DX11ClothDemo/btDirectComputeSupport.h
@@ -1,6 +1,6 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+Copyright (c) 2010 Advanced Micro Devices
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
@@ -13,6 +13,8 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 #ifndef BT_DIRECT_COMPUTE_SUPPORT_HPP
 #define BT_DIRECT_COMPUTE_SUPPORT_HPP
--- a/Demos/DX11ClothDemo/cap.h
+++ b/Demos/DX11ClothDemo/cap.h
@@ -1,3 +1,18 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2010 Advanced Micro Devices
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 class cap 
 {
--- a/Demos/DX11ClothDemo/cloth.h
+++ b/Demos/DX11ClothDemo/cloth.h
@@ -1,4 +1,22 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2010 Advanced Micro Devices
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #include <fstream>
 #include <iostream>
 #include <iomanip>
 class piece_of_cloth 
 {
@@ -171,7 +189,8 @@ public:
 			pd3dImmediateContext->PSSetShaderResources(0,1,&texture2D_view);
-			pd3dImmediateContext->DrawIndexed( (width*3*2+2 + height*width*3*2), 0, ( UINT )pSubset->VertexStart );
+			//pd3dImmediateContext->DrawIndexed( (width*3*2+2 + height*width*3*2), 0, ( UINT )pSubset->VertexStart );
 			pd3dImmediateContext->DrawIndexed( ((height-1)*(width-1)*3*2), 0, ( UINT )pSubset->VertexStart );
 		}
 		SAFE_RELEASE(pd3dImmediateContext);
@@ -246,7 +265,7 @@ public:
 		//unsigned int indices[] = {0,1,2, 1,3,2};
-		unsigned int* indices = new unsigned int[width*3*2+2 + height*width*3*2];
+		unsigned int* indices = new unsigned int[(height-1)*(width-1)*3*2];
 		for(int y = 0; y < height-1; y++)
 		{
@@ -265,7 +284,8 @@ public:
 			}
 		}
-		bufferDesc.ByteWidth = sizeof(unsigned int)*(width*3*2+2 + height*width*3*2);
+
 		bufferDesc.ByteWidth = sizeof(unsigned int)*((height-1)*(width-1)*3*2);
 		bufferDesc.BindFlags = D3D11_BIND_INDEX_BUFFER;
 		InitData.pSysMem = indices;
--- a/Demos/DX11ClothDemo/cloth_renderer.cpp
+++ b/Demos/DX11ClothDemo/cloth_renderer.cpp
@@ -32,18 +32,15 @@ class btDX11SIMDAwareSoftBodySolver;
 #include "BulletSoftBody/btSoftBodySolvers.h"
 #include "BulletSoftBody/btDefaultSoftBodySolver.h"
 #include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolver_CPU.h"
 //#include "BulletSoftBody/Solvers/CPU/btAcceleratedSoftBody_CPUVertexSolver.h"
 #include "BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h"
-//#include "BulletSoftBody/Solvers/DX11/btAcceleratedSoftBody_DX11SIMDAwareSolver.h"
+#include "BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h"
 //#include "BulletSoftBody/btAcceleratedSoftBody_DXVertexBuffers.h"
 #include "BulletSoftBody/btSoftBodyRigidBodyCollisionConfiguration.h"
-//#define USE_SIMDAWARE_SOLVER
+#define USE_SIMDAWARE_SOLVER
-#define USE_GPU_SOLVER
+//#define USE_GPU_SOLVER
 //#define USE_VERTEX_SOLVER
 #define USE_GPU_COPY
-const int numFlags = 2;
+const int numFlags = 5;
 const int clothWidth = 40;
 const int clothHeight = 60;//60;
 float _windAngle = 1.0;//0.4;
@@ -206,6 +203,7 @@ btSoftRigidDynamicsWorld* m_dynamicsWorld;
 btDefaultSoftBodySolver *g_defaultSolver = NULL;
 btCPUSoftBodySolver *g_cpuSolver = NULL;
 btDX11SoftBodySolver *g_dx11Solver = NULL;
 btDX11SIMDAwareSoftBodySolver *g_dx11SIMDSolver = NULL;
 btSoftBodySolver *g_solver = NULL;
@@ -454,12 +452,17 @@ void initBullet(void)
 #ifdef USE_GPU_SOLVER
 	g_dx11Solver = new btDX11SoftBodySolver( g_pd3dDevice, DXUTGetD3D11DeviceContext() );
 	g_solver = g_dx11Solver;
 #else
 #ifdef USE_SIMDAWARE_SOLVER
 	g_dx11SIMDSolver = new btDX11SIMDAwareSoftBodySolver( g_pd3dDevice, DXUTGetD3D11DeviceContext() );
 	g_solver = g_dx11SIMDSolver;
 #else
 	g_cpuSolver = new btCPUSoftBodySolver;
 	g_solver = g_cpuSolver;
 	//g_defaultSolver = new btDefaultSoftBodySolver;
 	//g_solver = g_defaultSolver;
 #endif
 #endif
@@ -1260,6 +1263,9 @@ void CALLBACK OnD3D11DestroyDevice( void* pUserContext )
 		delete g_cpuSolver;
 	if( g_dx11Solver )
 		delete g_dx11Solver;
 	if( g_dx11SIMDSolver )
 		delete g_dx11SIMDSolver;
 	for(int i=0; i< m_collisionShapes.size(); i++)
 		delete m_collisionShapes[i];
--- a/Demos/DX11ClothDemo/cylinder.h
+++ b/Demos/DX11ClothDemo/cylinder.h
@@ -1,3 +1,18 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2010 Advanced Micro Devices
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 class cylinder 
 {
--- a/Demos/OpenCLClothDemo/AMD/CMakeLists.txt
+++ b/Demos/OpenCLClothDemo/AMD/CMakeLists.txt
@@ -0,0 +1,102 @@
 # OpenCL cloth demo, AMD flavour: builds AppOpenCLClothDemo_AMD from the shared
 # demo sources one directory up and links it against OpenCL.lib taken from the
 # location named by the ATISTREAMSDKROOT environment variable.
 INCLUDE_DIRECTORIES(
 ${BULLET_PHYSICS_SOURCE_DIR}/src
 ${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL
 ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL
 )
 ADD_DEFINITIONS(-DUSE_AMD_OPENCL)
 ADD_DEFINITIONS(-DCL_PLATFORM_AMD)
 # NOTE(review): the $ENV{==NAME=} spelling used for distributable project files
 # looks like a placeholder that is substituted after generation - confirm.
 IF (INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 	INCLUDE_DIRECTORIES(		$ENV{==ATISTREAMSDKROOT=}/include )
 	IF (CMAKE_CL_64)
 		SET(CMAK_ATISTREAMSDK_LIBPATH 		$ENV{==ATISTREAMSDKROOT=}/lib/x86_64		)
 	ELSE(CMAKE_CL_64)
 		SET(CMAK_ATISTREAMSDK_LIBPATH		$ENV{==ATISTREAMSDKROOT=}/lib/x86		)
 	ENDIF(CMAKE_CL_64)
 ELSE()
 	INCLUDE_DIRECTORIES(		$ENV{ATISTREAMSDKROOT}/include	)
 	IF (CMAKE_CL_64)
 		SET(CMAK_ATISTREAMSDK_LIBPATH 		$ENV{ATISTREAMSDKROOT}/lib/x86_64 )
 	ELSE(CMAKE_CL_64)
 		SET(CMAK_ATISTREAMSDK_LIBPATH		$ENV{ATISTREAMSDKROOT}/lib/x86		)
 	ENDIF(CMAKE_CL_64)
 ENDIF()
 # GLEW import library bundled in the Glut directory (64-bit or 32-bit compiler)
 IF (CMAKE_CL_64)
 	SET(CMAK_GLEW_LIBRARY
 		${BULLET_PHYSICS_SOURCE_DIR}/Glut/glew64.lib		)
 ELSE(CMAKE_CL_64)
 	SET(CMAK_GLEW_LIBRARY		${BULLET_PHYSICS_SOURCE_DIR}/Glut/glew32.lib		)
 ENDIF(CMAKE_CL_64)
 IF (USE_GLUT)
 	LINK_LIBRARIES(
 		OpenGLSupport
 		BulletSoftBodySolvers_OpenCL_AMD
 		BulletSoftBodySolvers_CPU
 		BulletMultiThreaded
 		BulletSoftBody
 		BulletDynamics
 		BulletCollision
 		LinearMath
 		${GLUT_glut_LIBRARY}
 		${OPENGL_gl_LIBRARY}
 		${OPENGL_glu_LIBRARY}
 		${CMAK_GLEW_LIBRARY}
 		${CMAK_ATISTREAMSDK_LIBPATH}/OpenCL.lib
 	)
 	ADD_EXECUTABLE(AppOpenCLClothDemo_AMD
 		../cl_cloth_demo.cpp
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.h
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.h
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.cpp
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.cpp
 		../gl_win.cpp
 		../clstuff.cpp
 		../bmpLoader.cpp
 		../bmpLoader.h
 		../clstuff.h
 		../gl_win.h
 	)
 	# Every command below names the executable target, so it lives inside the
 	# USE_GLUT branch: without GLUT the target is never created and referring
 	# to it would be a configure-time error.
 	# Copy the GLUT/GLEW runtime DLLs next to the executable on Windows.
 	IF (WIN32 AND NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 		IF (CMAKE_CL_64)
 			ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_AMD	POST_BUILD
 				COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR}
 				COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW64.DLL ${CMAKE_CURRENT_BINARY_DIR}
 			)
 		ELSE(CMAKE_CL_64)
 			ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_AMD	POST_BUILD
 				COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR}
 				COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW32.DLL ${CMAKE_CURRENT_BINARY_DIR}
 			)
 		ENDIF(CMAKE_CL_64)
 	ENDIF()
 	# The demo reads its flag textures from the build directory.
 	ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_AMD	POST_BUILD
 		COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/amdFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
 		COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/atiFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
 	)
 	IF (UNIX)
 		TARGET_LINK_LIBRARIES(AppOpenCLClothDemo_AMD pthread)
 	ENDIF(UNIX)
 ENDIF (USE_GLUT)
--- a/Demos/OpenCLClothDemo/Apple/CMakeLists.txt
+++ b/Demos/OpenCLClothDemo/Apple/CMakeLists.txt
@@ -0,0 +1,60 @@
 # OpenCL cloth demo, Apple flavour: builds AppOpenCLClothDemo_Apple from the
 # shared demo sources one directory up and links it against the OpenCL library
 # located with FIND_LIBRARY.
 INCLUDE_DIRECTORIES(
 ${BULLET_PHYSICS_SOURCE_DIR}/src
 ${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL
 ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL
 )
 IF (APPLE)
  FIND_LIBRARY(OPENCL_LIBRARY OpenCL DOC "OpenCL lib for OSX")
  FIND_PATH(OPENCL_INCLUDE_DIR OpenCL/cl.h DOC "Include for OpenCL on OSX")
 ENDIF (APPLE)
 IF (USE_GLUT)
 	# CMAK_GLEW_LIBRARY is not set in this file; it expands to nothing unless
 	# an enclosing scope defines it.
 	LINK_LIBRARIES(
 		OpenGLSupport
 		BulletSoftBodySolvers_OpenCL_Apple
 		BulletSoftBodySolvers_CPU
 		BulletMultiThreaded
 		BulletSoftBody
 		BulletDynamics
 		BulletCollision
 		LinearMath
 		${OPENCL_LIBRARY}
 		${GLUT_glut_LIBRARY}
 		${OPENGL_gl_LIBRARY}
 		${OPENGL_glu_LIBRARY}
 		${CMAK_GLEW_LIBRARY}
 	)
 	ADD_EXECUTABLE(AppOpenCLClothDemo_Apple
 		../cl_cloth_demo.cpp
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.h
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.h
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.cpp
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.cpp
 		../gl_win.cpp
 		../clstuff.cpp
 		../bmpLoader.cpp
 		../bmpLoader.h
 		../clstuff.h
 		../gl_win.h
 	)
 	# Every command below names the executable target, so it lives inside the
 	# USE_GLUT branch: without GLUT the target is never created and referring
 	# to it would be a configure-time error.
 	# The demo reads its flag textures from the build directory.
 	ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_Apple	POST_BUILD
 		COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/amdFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
 		COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/atiFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
 	)
 	IF (UNIX)
 		TARGET_LINK_LIBRARIES(AppOpenCLClothDemo_Apple pthread)
 	ENDIF(UNIX)
 ENDIF (USE_GLUT)
--- a/Demos/OpenCLClothDemo/CLClothDemo.sln
+++ b/Demos/OpenCLClothDemo/CLClothDemo.sln
@@ -0,0 +1,20 @@
 Microsoft Visual Studio Solution File, Format Version 10.00
 # Visual Studio 2008
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CLClothDemo", "CLClothDemo.vcproj", "{A61906AF-B5DE-454E-99F6-B653C250D221}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Win32 = Debug|Win32
 		Release|Win32 = Release|Win32
 	EndGlobalSection
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
 		{A61906AF-B5DE-454E-99F6-B653C250D221}.Debug|Win32.ActiveCfg = Debug|Win32
 		{A61906AF-B5DE-454E-99F6-B653C250D221}.Debug|Win32.Build.0 = Debug|Win32
 		{A61906AF-B5DE-454E-99F6-B653C250D221}.Release|Win32.ActiveCfg = Release|Win32
 		{A61906AF-B5DE-454E-99F6-B653C250D221}.Release|Win32.Build.0 = Release|Win32
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
 	EndGlobalSection
 EndGlobal
--- a/Demos/OpenCLClothDemo/CLClothDemo.vcproj
+++ b/Demos/OpenCLClothDemo/CLClothDemo.vcproj
@@ -0,0 +1,233 @@
 <?xml version="1.0" encoding="Windows-1252"?>
 <VisualStudioProject
 	ProjectType="Visual C++"
 	Version="9.00"
 	Name="CLClothDemo"
 	ProjectGUID="{A61906AF-B5DE-454E-99F6-B653C250D221}"
 	RootNamespace="CLClothDemo"
 	Keyword="Win32Proj"
 	TargetFrameworkVersion="196613"
 	>
 	<Platforms>
 		<Platform
 			Name="Win32"
 		/>
 	</Platforms>
 	<ToolFiles>
 	</ToolFiles>
 	<Configurations>
 		<Configuration
 			Name="Debug|Win32"
 			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
 			IntermediateDirectory="$(ConfigurationName)"
 			ConfigurationType="1"
 			CharacterSet="1"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
 			/>
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
 			<Tool
 				Name="VCWebServiceProxyGeneratorTool"
 			/>
 			<Tool
 				Name="VCMIDLTool"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
 				AdditionalIncludeDirectories="S:\SVN\GpuClothAMD\Bullet\BulletTrunk\Glut;&quot;C:\Program Files (x86)\ATI Stream\include&quot;;..\..\..\projects\physics\Bullet\BulletTrunk\src;S:\SVN\GpuClothAMD\Bullet\BulletTrunk\src"
 				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="3"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
 				DebugInformationFormat="4"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCPreLinkEventTool"
 			/>
 			<Tool
 				Name="VCLinkerTool"
 				LinkLibraryDependencies="false"
 				AdditionalDependencies="glew32.lib OpenCL.lib ..\..\lib\Debug\BulletDynamics.lib ..\..\lib\Debug\BulletCollision.lib ..\..\lib\Debug\LinearMath.lib ..\..\lib\Debug\BulletSoftBody.lib ..\..\lib\Debug\BulletSoftBodySolvers_CPU.lib ..\..\lib\Debug\BulletSoftBodySolvers_OpenCL.lib"
 				LinkIncremental="1"
 				AdditionalLibraryDirectories="&quot;C:\Program Files (x86)\ATI Stream\lib\x86&quot;;S:\SVN\GpuClothAMD\Bullet\BulletTrunk\Glut;S:\SVN\GpuClothAMD\Bullet\BulletTrunk\lib\Debug"
 				GenerateDebugInformation="true"
 				SubSystem="1"
 				ImportLibrary="S:\SVN\GpuClothAMD\Bullet\BulletTrunk\Demos\DX11ClothDemo\Debug\AppDX11ClothDemo.lib"
 				TargetMachine="0"
 			/>
 			<Tool
 				Name="VCALinkTool"
 			/>
 			<Tool
 				Name="VCManifestTool"
 			/>
 			<Tool
 				Name="VCXDCMakeTool"
 			/>
 			<Tool
 				Name="VCBscMakeTool"
 			/>
 			<Tool
 				Name="VCFxCopTool"
 			/>
 			<Tool
 				Name="VCAppVerifierTool"
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
 		<Configuration
 			Name="Release|Win32"
 			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
 			IntermediateDirectory="$(ConfigurationName)"
 			ConfigurationType="1"
 			CharacterSet="1"
 			WholeProgramOptimization="1"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
 			/>
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
 			<Tool
 				Name="VCWebServiceProxyGeneratorTool"
 			/>
 			<Tool
 				Name="VCMIDLTool"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				EnableIntrinsicFunctions="true"
 				AdditionalIncludeDirectories="S:\SVN\GpuClothAMD\Bullet\BulletTrunk\Glut;&quot;C:\Program Files (x86)\ATI Stream\include&quot;;..\..\..\projects\physics\Bullet\BulletTrunk\src;S:\SVN\GpuClothAMD\Bullet\BulletTrunk\src"
 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
 				RuntimeLibrary="2"
 				EnableFunctionLevelLinking="true"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
 				DebugInformationFormat="3"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCPreLinkEventTool"
 			/>
 			<Tool
 				Name="VCLinkerTool"
 				AdditionalDependencies="glew32.lib OpenCL.lib BulletSoftBody.lib BulletDynamics.lib BulletCollision.lib LinearMath.lib BulletSoftBodySolvers_CPU.lib BulletSoftBodySolvers_OpenCL.lib"
 				LinkIncremental="1"
 				AdditionalLibraryDirectories="..\Bullet\BulletTrunk\lib\Release\;&quot;C:\Program Files (x86)\ATI Stream\lib\x86&quot;;S:\SVN\GpuClothAMD\Bullet\BulletTrunk\Glut;S:\SVN\GpuClothAMD\Bullet\BulletTrunk\lib\Release"
 				GenerateDebugInformation="true"
 				SubSystem="1"
 				OptimizeReferences="2"
 				EnableCOMDATFolding="2"
 				TargetMachine="1"
 			/>
 			<Tool
 				Name="VCALinkTool"
 			/>
 			<Tool
 				Name="VCManifestTool"
 			/>
 			<Tool
 				Name="VCXDCMakeTool"
 			/>
 			<Tool
 				Name="VCBscMakeTool"
 			/>
 			<Tool
 				Name="VCFxCopTool"
 			/>
 			<Tool
 				Name="VCAppVerifierTool"
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
 	</Configurations>
 	<References>
 	</References>
 	<Files>
 		<Filter
 			Name="Source Files"
 			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
 			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
 			>
 			<File
 				RelativePath=".\bmpLoader.cpp"
 				>
 			</File>
 			<File
 				RelativePath=".\cl_cloth_demo.cpp"
 				>
 			</File>
 			<File
 				RelativePath=".\clstuff.cpp"
 				>
 			</File>
 			<File
 				RelativePath=".\gl_win.cpp"
 				>
 			</File>
 		</Filter>
 		<Filter
 			Name="Header Files"
 			Filter="h;hpp;hxx;hm;inl;inc;xsd"
 			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
 			>
 			<File
 				RelativePath=".\bmpLoader.hpp"
 				>
 			</File>
 			<File
 				RelativePath=".\btOpenCLSupport.h"
 				>
 			</File>
 			<File
 				RelativePath=".\cloth.h"
 				>
 			</File>
 			<File
 				RelativePath=".\clstuff.hpp"
 				>
 			</File>
 			<File
 				RelativePath=".\gl_win.hpp"
 				>
 			</File>
 		</Filter>
 		<Filter
 			Name="Resource Files"
 			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
 			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
 			>
 		</Filter>
 	</Files>
 	<Globals>
 	</Globals>
 </VisualStudioProject>
--- a/Demos/OpenCLClothDemo/CMakeLists.txt
+++ b/Demos/OpenCLClothDemo/CMakeLists.txt
@@ -0,0 +1,15 @@
 # Selects which OpenCL back-end variants of the cloth demo are configured.
 # Each sub-directory holds its own CMakeLists.txt for one variant.
 # MiniCL variant
 IF(BUILD_MINICL_OPENCL_DEMOS)
 	SUBDIRS( MiniCL  )
 ENDIF()
 # AMD variant
 IF(BUILD_AMD_OPENCL_DEMOS)
 	SUBDIRS(AMD)
 ENDIF()
 # NVidia variant
 IF(BUILD_NVIDIA_OPENCL_DEMOS)
 	SUBDIRS(NVidia)
 ENDIF()
 # The Apple variant is keyed on the platform rather than on a BUILD_* option
 IF(APPLE)
 	SUBDIRS(Apple)
 ENDIF()
--- a/Demos/OpenCLClothDemo/MiniCL/CMakeLists.txt
+++ b/Demos/OpenCLClothDemo/MiniCL/CMakeLists.txt
@@ -0,0 +1,86 @@
 # OpenCL cloth demo, MiniCL flavour: builds AppOpenCLClothDemo_Mini from the
 # shared demo sources one directory up and links it against the MiniCL library
 # instead of a vendor OpenCL.lib.
 INCLUDE_DIRECTORIES(
 ${BULLET_PHYSICS_SOURCE_DIR}/src
 ${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL
 ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL
 )
 ADD_DEFINITIONS(-DUSE_MINICL)
 # GLEW import library bundled in the Glut directory (Windows only)
 IF (WIN32)
 	IF (CMAKE_CL_64)
 		SET(CMAK_GLEW_LIBRARY
 			${BULLET_PHYSICS_SOURCE_DIR}/Glut/glew64.lib		)
 	ELSE(CMAKE_CL_64)
 		SET(CMAK_GLEW_LIBRARY		${BULLET_PHYSICS_SOURCE_DIR}/Glut/glew32.lib		)
 	ENDIF(CMAKE_CL_64)
 ENDIF()
 IF (USE_GLUT)
 	LINK_LIBRARIES(
 		OpenGLSupport
 		BulletSoftBodySolvers_OpenCL_Mini
 		BulletSoftBodySolvers_CPU
 		MiniCL
 		BulletMultiThreaded
 		BulletSoftBody
 		BulletDynamics
 		BulletCollision
 		LinearMath
 		${GLUT_glut_LIBRARY}
 		${OPENGL_gl_LIBRARY}
 		${OPENGL_glu_LIBRARY}
 		${CMAK_GLEW_LIBRARY}
 	)
 	ADD_EXECUTABLE(AppOpenCLClothDemo_Mini
 		../cl_cloth_demo.cpp
 		../gl_win.cpp
 		../clstuff.cpp
 		../bmpLoader.cpp
 		../bmpLoader.h
 		../clstuff.h
 		../gl_win.h
 		${BULLET_PHYSICS_SOURCE_DIR}/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.h
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.h
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.cpp
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.cpp
 	)
 	# Every command below names the executable target, so it lives inside the
 	# USE_GLUT branch: without GLUT the target is never created and referring
 	# to it would be a configure-time error.
 	# Copy the GLUT/GLEW runtime DLLs next to the executable on Windows.
 	IF (WIN32 AND NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 		IF (CMAKE_CL_64)
 			ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_Mini	POST_BUILD
 				COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR}
 				COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW64.DLL ${CMAKE_CURRENT_BINARY_DIR}
 			)
 		ELSE(CMAKE_CL_64)
 			ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_Mini	POST_BUILD
 				COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR}
 				COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW32.DLL ${CMAKE_CURRENT_BINARY_DIR}
 			)
 		ENDIF(CMAKE_CL_64)
 	ENDIF()
 	# The demo reads its flag textures from the build directory.
 	ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_Mini	POST_BUILD
 		COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/amdFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
 		COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/atiFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
 	)
 	IF (UNIX)
 		TARGET_LINK_LIBRARIES(AppOpenCLClothDemo_Mini pthread)
 	ENDIF(UNIX)
 ENDIF (USE_GLUT)
--- a/Demos/OpenCLClothDemo/NVidia/CMakeLists.txt
+++ b/Demos/OpenCLClothDemo/NVidia/CMakeLists.txt
@@ -0,0 +1,102 @@
 # OpenCL cloth demo, NVidia flavour: builds AppOpenCLClothDemo_NVidia from the
 # shared demo sources one directory up and links it against OpenCL.lib taken
 # from the location named by the NVSDKCOMPUTE_ROOT environment variable.
 INCLUDE_DIRECTORIES(
 ${BULLET_PHYSICS_SOURCE_DIR}/src
 ${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL
 ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL
 )
 # NOTE(review): the $ENV{==NAME=} spelling used for distributable project files
 # looks like a placeholder that is substituted after generation - confirm.
 IF(INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 	INCLUDE_DIRECTORIES( $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/inc	)
 	IF (CMAKE_CL_64)
 		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/x64    )
 	ELSE(CMAKE_CL_64)
 		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/Win32	)
 	ENDIF(CMAKE_CL_64)
 ELSE()
 	INCLUDE_DIRECTORIES( $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/inc	)
 	IF (CMAKE_CL_64)
 		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/x64 )
 	ELSE(CMAKE_CL_64)
 		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/Win32	)
 	ENDIF(CMAKE_CL_64)
 ENDIF()
 # GLEW import library bundled in the Glut directory (64-bit or 32-bit compiler)
 IF (CMAKE_CL_64)
 	SET(CMAK_GLEW_LIBRARY
 		${BULLET_PHYSICS_SOURCE_DIR}/Glut/glew64.lib		)
 ELSE(CMAKE_CL_64)
 	SET(CMAK_GLEW_LIBRARY		${BULLET_PHYSICS_SOURCE_DIR}/Glut/glew32.lib		)
 ENDIF(CMAKE_CL_64)
 IF (USE_GLUT)
 	LINK_LIBRARIES(
 		OpenGLSupport
 		BulletSoftBodySolvers_OpenCL_NVidia
 		BulletSoftBodySolvers_CPU
 		BulletMultiThreaded
 		BulletSoftBody
 		BulletDynamics
 		BulletCollision
 		LinearMath
 		${GLUT_glut_LIBRARY}
 		${OPENGL_gl_LIBRARY}
 		${OPENGL_glu_LIBRARY}
 		${CMAK_GLEW_LIBRARY}
 		${CMAK_NVSDKCOMPUTE_LIBPATH}/OpenCL.lib
 	)
 	ADD_EXECUTABLE(AppOpenCLClothDemo_NVidia
 		../cl_cloth_demo.cpp
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.h
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.h
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.cpp
 		${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.cpp
 		../gl_win.cpp
 		../clstuff.cpp
 		../bmpLoader.cpp
 		../bmpLoader.h
 		../clstuff.h
 		../gl_win.h
 	)
 	# Every command below names the executable target, so it lives inside the
 	# USE_GLUT branch: without GLUT the target is never created and referring
 	# to it would be a configure-time error.
 	# Copy the GLUT/GLEW runtime DLLs next to the executable on Windows.
 	IF (WIN32 AND NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 		IF (CMAKE_CL_64)
 			ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_NVidia	POST_BUILD
 				COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR}
 				COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW64.DLL ${CMAKE_CURRENT_BINARY_DIR}
 			)
 		ELSE(CMAKE_CL_64)
 			ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_NVidia	POST_BUILD
 				COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR}
 				COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW32.DLL ${CMAKE_CURRENT_BINARY_DIR}
 			)
 		ENDIF(CMAKE_CL_64)
 	ENDIF()
 	# The demo reads its flag textures from the build directory.
 	ADD_CUSTOM_COMMAND(	TARGET AppOpenCLClothDemo_NVidia	POST_BUILD
 		COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/amdFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
 		COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenCLClothDemo/atiFlag.bmp ${CMAKE_CURRENT_BINARY_DIR}
 	)
 	IF (UNIX)
 		TARGET_LINK_LIBRARIES(AppOpenCLClothDemo_NVidia pthread)
 	ENDIF(UNIX)
 ENDIF (USE_GLUT)
--- a/Demos/OpenCLClothDemo/amdFlag.bmp
+++ b/Demos/OpenCLClothDemo/amdFlag.bmp
--- a/Demos/OpenCLClothDemo/atiFlag.bmp
+++ b/Demos/OpenCLClothDemo/atiFlag.bmp
--- a/Demos/OpenCLClothDemo/bmpLoader.cpp
+++ b/Demos/OpenCLClothDemo/bmpLoader.cpp
@@ -0,0 +1,325 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2010 Advanced Micro Devices
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #include "bmpLoader.h"
 #include <new>
 #include <cstring>
 #include <cstdio>
 namespace amd
 {
 static const short bitMapID = 19778;
 //! Frees the pixel and palette buffers and marks the bitmap as not loaded.
 void
 BitMap::releaseResources(void)
 {
    // delete[] on a null pointer does nothing, so no guards are required
    delete[] pixels_;
    delete[] colors_;
    // Reset the owning pointers so a second call is harmless
    colors_   = NULL;
    pixels_   = NULL;
    isLoaded_ = false;
 }
 //! Copy-assigns \p rhs to this bitmap.
 //! Both headers are copied and, when \p rhs holds a loaded image, its palette
 //! (if any) and pixel buffer are deep-copied. Buffers previously owned by this
 //! object are freed first. If an allocation fails the object is left in the
 //! "not loaded" state with no buffers.
 //! \param rhs bitmap to copy from
 //! \return reference to this object
 BitMap& BitMap::operator=(const BitMap& rhs)
 {
    if (this == &rhs) {
        return *this;
    }
    // Free the buffers this object already owns; simply resetting the
    // pointers to NULL would leak them. This also clears isLoaded_.
    releaseResources();
    // Copy header and header info through the base-class sub-objects
    static_cast<BitMapHeader &>(*this)     = rhs;
    static_cast<BitMapInfoHeader &>(*this) = rhs;
    numColors_ = rhs.numColors_;
    // Source holds no image: nothing left to copy
    if (!rhs.isLoaded_) {
        return *this;
    }
    // nothrow keeps the NULL checks below meaningful
    if (rhs.colors_ != NULL) {
        colors_ = new (std::nothrow) ColorPalette[numColors_];
        if (colors_ == NULL) {
            return *this;
        }
        memcpy(colors_, rhs.colors_, numColors_ * sizeof(ColorPalette));
    }
    const std::size_t numPixels =
        static_cast<std::size_t>(width) * static_cast<std::size_t>(height);
    pixels_ = new (std::nothrow) uchar4[numPixels];
    if (pixels_ == NULL) {
        releaseResources();
        return *this;
    }
    memcpy(pixels_, rhs.pixels_, numPixels * sizeof(uchar4));
    isLoaded_ = true;
    return *this;
 }
 void
 BitMap::load(const char * filename)
 {
    // Release any existing resources
    releaseResources();
    // Open BMP file
    FILE * fd = fopen(filename, "rb");
    // Opened OK
    if (fd != NULL) {
        // Read header
        fread((BitMapHeader *)this, sizeof(BitMapHeader), 1, fd);
        // Failed to read header
        if (ferror(fd)) {
            fclose(fd);
            return;
        }
        // Confirm that we have a bitmap file
        if (id != bitMapID) {
            fclose(fd);
            return;
        }
        // Read map info header
        fread((BitMapInfoHeader *)this, sizeof(BitMapInfoHeader), 1, fd);
        // Failed to read map info header
        if (ferror(fd)) {
            fclose(fd);
            return;
        }
        // No support for compressed images
        if (compression) {
            fclose(fd);
            return;
        }
        // Support only 8 or 24 bits images
        if (bitsPerPixel < 8) {
            fclose(fd);
            return;
        }
        // Store number of colors
        numColors_ = 1 << bitsPerPixel;
        //load the palate for 8 bits per pixel
        if(bitsPerPixel == 8) {
            colors_ = new ColorPalette[numColors_];
            if (colors_ == NULL) {
                fclose(fd);
                return;
            }
            fread(
                (char *)colors_,
                numColors_ * sizeof(ColorPalette),
                1,
                fd);
            // Failed to read colors
            if (ferror(fd)) {
                fclose(fd);
                return;
            }
        }
        // Allocate buffer to hold all pixels
        unsigned int sizeBuffer = size - offset;
        unsigned char * tmpPixels = new unsigned char[sizeBuffer];
        if (tmpPixels == NULL) {
            delete colors_;
            colors_ = NULL;
            fclose(fd);
            return;
        }
        // Read pixels from file, including any padding
        fread(tmpPixels, sizeBuffer * sizeof(unsigned char), 1, fd);
        // Failed to read pixel data
        if (ferror(fd)) {
            delete colors_;
            colors_ = NULL;
            delete tmpPixels;
            fclose(fd);
            return;
        }
        // Allocate image
        pixels_ = new uchar4[width * height];
        if (pixels_ == NULL) {
            delete colors_;
            colors_ = NULL;
            delete tmpPixels;
            fclose(fd);
            return;
        }
        // Set image, including w component (white)
        memset(pixels_, 0xff, width * height * sizeof(uchar4));
        unsigned int index = 0;
        for(int y = 0; y < height; y++) {
            for(int x = 0; x < width; x++) {
                // Read RGB values
                if (bitsPerPixel == 8) {
                    pixels_[(y * width + x)] = colors_[tmpPixels[index++]];
                }
                else { // 24 bit
                    pixels_[(y * width + x)].z = tmpPixels[index++];
                    pixels_[(y * width + x)].y = tmpPixels[index++];
                    pixels_[(y * width + x)].x = tmpPixels[index++];
                }
            }
            // Handle padding
            for(int x = 0; x < (4 - (3 * width) % 4) % 4; x++) {
                index++;
            }
        }
        // Loaded file so we can close the file.
        fclose(fd);
        delete[] tmpPixels;
        // Loaded file so record this fact
        isLoaded_  = true;
    }
 }
 //! Looks up \p color in the palette.
 //! \param color color to search for; all four components must match
 //! \return index of the first matching palette entry, or 0 when no entry matches
 int
 BitMap::colorIndex(uchar4 color)
 {
    int found = 0; // fallback when the color is not in the palette
    for (int slot = 0; slot < numColors_; ++slot) {
        const ColorPalette & entry = colors_[slot];
        const bool sameColor =
            entry.x == color.x && entry.y == color.y &&
            entry.z == color.z && entry.w == color.w;
        if (sameColor) {
            found = slot;
            break;
        }
    }
    return found;
 }
 bool
 BitMap::write(const char * filename)
 {
    if (!isLoaded_) {
        return false;
    }
    // Open BMP file
    FILE * fd = fopen(filename, "wb");
    // Opened OK
    if (fd != NULL) {
        // Write header
        fwrite((BitMapHeader *)this, sizeof(BitMapHeader), 1, fd);
        // Failed to write header
        if (ferror(fd)) {
            fclose(fd);
            return false;
        }
        // Write map info header
        fwrite((BitMapInfoHeader *)this, sizeof(BitMapInfoHeader), 1, fd);
        // Failed to write map info header
        if (ferror(fd)) {
            fclose(fd);
            return false;
        }
        // Write palate for 8 bits per pixel
        if(bitsPerPixel == 8) {
            fwrite(
                (char *)colors_,
                numColors_ * sizeof(ColorPalette),
                1,
                fd);
            // Failed to write colors
            if (ferror(fd)) {
                fclose(fd);
                return false;
            }
        }
        for(int y = 0; y < height; y++) {
            for(int x = 0; x < width; x++) {
                // Read RGB values
                if (bitsPerPixel == 8) {
                    fputc(
                        colorIndex(
                            pixels_[(y * width + x)]),
                            fd);
                }
                else { // 24 bit
                    fputc(pixels_[(y * width + x)].z, fd);
                    fputc(pixels_[(y * width + x)].y, fd);
                    fputc(pixels_[(y * width + x)].x, fd);
                    if (ferror(fd)) {
                        fclose(fd);
                        return false;
                    }
                }
            }
            // Add padding
            for(int x = 0; x < (4 - (3 * width) % 4) % 4; x++) {
                fputc(0, fd);
            }
        }
        return true;
    }
    return false;
 }
 } // amd
--- a/Demos/OpenCLClothDemo/bmpLoader.h
+++ b/Demos/OpenCLClothDemo/bmpLoader.h
@@ -0,0 +1,201 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2010 Advanced Micro Devices
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #ifndef BMPLOADER_H_
 #define BMPLOADER_H_
 #include <cstdlib>
 #include <iostream>
 namespace amd
 {
 //! @fixme this needs to be moved to common types header?
 #pragma pack(1)
 //! Four unsigned 8-bit components. Declared inside the enclosing
 //! #pragma pack(1) region. BitMap stores its pixels and its palette entries
 //! as arrays of this type.
 typedef struct
 {
    unsigned char x;
    unsigned char y;
    unsigned char z;
    unsigned char w;
 } uchar4;
 //! A palette entry has the same four-component layout as a pixel.
 typedef uchar4 ColorPalette;
 //! \struct Bitmap header info
 //! First base class of BitMap. Declared inside the enclosing #pragma pack(1)
 //! region, so the members are laid out without padding between them.
 //! NOTE(review): field meanings below are presumed from the names and the
 //! usual BMP file header - confirm against the loader implementation.
 typedef struct {
    short id;         // presumably the file-type magic
    int size;         // presumably the total file size in bytes
    short reserved1;
    short reserved2;
    int offset;       // presumably the byte offset of the pixel data
 } BitMapHeader;
 //! \struct Bitmap info header
 //! Second base class of BitMap. Declared inside the enclosing
 //! #pragma pack(1) region. BitMap::getWidth()/getHeight() return the width
 //! and height members of this struct.
 //! NOTE(review): meanings of the remaining fields are presumed from their
 //! names and the usual BMP info header - confirm against the loader.
 typedef struct {
    int sizeInfo;
    int width;            // image width in pixels (returned by getWidth())
    int height;           // image height in pixels (returned by getHeight())
    short planes;
    short bitsPerPixel;
    unsigned compression;
    unsigned imageSize;
    int xPelsPerMeter;
    int yPelsPerMeter;
    int clrUsed;
    int clrImportant;
 } BitMapInfoHeader;
 //! \class Bitmap used to load a bitmap image from a file.
 class BitMap : public BitMapHeader, public BitMapInfoHeader
 {
 private:
    uchar4 * pixels_;
    int numColors_;
    ColorPalette * colors_;
    bool isLoaded_;
    void releaseResources(void);
    int colorIndex(uchar4 color);
 public:
    //! \brief Default constructor
    BitMap()
        : pixels_(NULL),
          numColors_(0),
          colors_(NULL),
          isLoaded_(false)
    {}
    /*!\brief Constructor
     *
     * Tries to load bitmap image from filename provided.
     *
     * \param filename pointer to null terminated string that is the path and
     * filename to the bitmap image to be loaded.
     *
     * In the base of an error, e.g. the bitmap file could not be loaded for
     * some reason, then a following call to isLoaded will return false.
     */
    BitMap(const char * filename)
        : pixels_(NULL),
          numColors_(0),
          colors_(NULL),
          isLoaded_(false)
    {
        load(filename);
    }
    /*! \brief Copy constructor
     *
     * \param rhs is the bitmap to be copied (cloned).
     */
    BitMap(const BitMap& rhs)
    {
        *this = rhs;
    }
    //! \brief Destructor
    ~BitMap()
    {
        releaseResources();
    }
    /*! \brief Assignment
     * \param rhs is the bitmap to be assigned (cloned).
     */
    BitMap& operator=(const BitMap& rhs);
    /*! \brief Load Bitmap image
     *
     * \param filename is a pointer to a null terminated string that is the
     * path and filename name to the the bitmap file to be loaded.
     *
     * In the base of an error, e.g. the bitmap file could not be loaded for
     * some reason, then a following call to isLoaded will return false.
     */
    void
    load(const char * filename);
    /*! \brief Write Bitmap image
     *
     * \param filename is a pointer to a null terminated string that is the
     * path and filename name to the the bitmap file to be written.
     *
     * \return In the case that the bitmap is written true is returned. In
     * the case that a bitmap image is not already loaded or the write fails
     * for some reason false is returned.
     */
    bool
    write(const char * filename);
    /*! \brief Get image width
     *
     * \return If a bitmap image has been successfully loaded, then the width
     * image is returned, otherwise -1;
     */
    int
    getWidth(void) const
    {
        if (isLoaded_) {
            return width;
        }
        else {
            return -1;
        }
    }
    /*! \brief Get image height
     *
     * \return If a bitmap image has been successfully loaded, then the height
     * image is returned, otherwise -1.
     */
    int
    getHeight(void) const
    {
        if (isLoaded_) {
            return height;
        }
        else {
            return -1;
        }
    }
    /*! \brief Get image width
     *
     * \return If a bitmap image has been successfully loaded, then returns
     * a pointer to image's pixels, otherwise NULL.
     */
    const uchar4 *
    getPixels(void) const { return pixels_; }
    /*! \brief Is an image currently loaded
     *
     * \return If a bitmap image has been successfully loaded, then returns
     * true, otherwise if an image could not be loaded or an image has yet
     * to be loaded false is returned.
     */
    bool
    isLoaded(void) const { return isLoaded_; }
 };
 #pragma pack()
 }
 #endif // BMPLOADER_H_
--- a/Demos/OpenCLClothDemo/bmpLoader.hpp
+++ b/Demos/OpenCLClothDemo/bmpLoader.hpp
@@ -0,0 +1,189 @@
 //
 // Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
 //
 #ifndef BMPLOADER_H_
 #define BMPLOADER_H_
 #include <cstdlib>
 #include <iostream>
 namespace amd
 {
 //! @fixme this needs to be moved to common types header?
 #pragma pack(1)
 //! Four unsigned 8-bit components. Declared inside the enclosing
 //! #pragma pack(1) region. BitMap stores its pixels and its palette entries
 //! as arrays of this type.
 typedef struct
 {
    unsigned char x;
    unsigned char y;
    unsigned char z;
    unsigned char w;
 } uchar4;
 //! A palette entry has the same four-component layout as a pixel.
 typedef uchar4 ColorPalette;
 //! \struct Bitmap header info
 //! First base class of BitMap. Declared inside the enclosing #pragma pack(1)
 //! region, so the members are laid out without padding between them.
 //! NOTE(review): field meanings below are presumed from the names and the
 //! usual BMP file header - confirm against the loader implementation.
 typedef struct {
    short id;         // presumably the file-type magic
    int size;         // presumably the total file size in bytes
    short reserved1;
    short reserved2;
    int offset;       // presumably the byte offset of the pixel data
 } BitMapHeader;
 //! \struct Bitmap info header
 //! Second base class of BitMap. Declared inside the enclosing
 //! #pragma pack(1) region. BitMap::getWidth()/getHeight() return the width
 //! and height members of this struct.
 //! NOTE(review): meanings of the remaining fields are presumed from their
 //! names and the usual BMP info header - confirm against the loader.
 typedef struct {
    int sizeInfo;
    int width;            // image width in pixels (returned by getWidth())
    int height;           // image height in pixels (returned by getHeight())
    short planes;
    short bitsPerPixel;
    unsigned compression;
    unsigned imageSize;
    int xPelsPerMeter;
    int yPelsPerMeter;
    int clrUsed;
    int clrImportant;
 } BitMapInfoHeader;
 //! \class Bitmap used to load a bitmap image from a file.
 class BitMap : public BitMapHeader, public BitMapInfoHeader
 {
 private:
    uchar4 * pixels_;
    int numColors_;
    ColorPalette * colors_;
    bool isLoaded_;
    void releaseResources(void);
    int colorIndex(uchar4 color);
 public:
    //! \brief Default constructor
    BitMap()
        : pixels_(NULL),
          numColors_(0),
          colors_(NULL),
          isLoaded_(false)
    {}
    /*!\brief Constructor
     *
     * Tries to load bitmap image from filename provided.
     *
     * \param filename pointer to null terminated string that is the path and
     * filename to the bitmap image to be loaded.
     *
     * In the base of an error, e.g. the bitmap file could not be loaded for
     * some reason, then a following call to isLoaded will return false.
     */
    BitMap(const char * filename)
        : pixels_(NULL),
          numColors_(0),
          colors_(NULL),
          isLoaded_(false)
    {
        load(filename);
    }
    /*! \brief Copy constructor
     *
     * \param rhs is the bitmap to be copied (cloned).
     */
    BitMap(const BitMap& rhs)
    {
        *this = rhs;
    }
    //! \brief Destructor
    ~BitMap()
    {
        releaseResources();
    }
    /*! \brief Assignment
     * \param rhs is the bitmap to be assigned (cloned).
     */
    BitMap& operator=(const BitMap& rhs);
    /*! \brief Load Bitmap image
     *
     * \param filename is a pointer to a null terminated string that is the
     * path and filename name to the the bitmap file to be loaded.
     *
     * In the base of an error, e.g. the bitmap file could not be loaded for
     * some reason, then a following call to isLoaded will return false.
     */
    void
    load(const char * filename);
    /*! \brief Write Bitmap image
     *
     * \param filename is a pointer to a null terminated string that is the
     * path and filename name to the the bitmap file to be written.
     *
     * \return In the case that the bitmap is written true is returned. In
     * the case that a bitmap image is not already loaded or the write fails
     * for some reason false is returned.
     */
    bool
    write(const char * filename);
    /*! \brief Get image width
     *
     * \return If a bitmap image has been successfully loaded, then the width
     * image is returned, otherwise -1;
     */
    int
    getWidth(void) const
    {
        if (isLoaded_) {
            return width;
        }
        else {
            return -1;
        }
    }
    /*! \brief Get image height
     *
     * \return If a bitmap image has been successfully loaded, then the height
     * image is returned, otherwise -1.
     */
    int
    getHeight(void) const
    {
        if (isLoaded_) {
            return height;
        }
        else {
            return -1;
        }
    }
    /*! \brief Get image width
     *
     * \return If a bitmap image has been successfully loaded, then returns
     * a pointer to image's pixels, otherwise NULL.
     */
    const uchar4 *
    getPixels(void) const { return pixels_; }
    /*! \brief Is an image currently loaded
     *
     * \return If a bitmap image has been successfully loaded, then returns
     * true, otherwise if an image could not be loaded or an image has yet
     * to be loaded false is returned.
     */
    bool
    isLoaded(void) const { return isLoaded_; }
 };
 #pragma pack()
 }
 #endif // BMPLOADER_H_
--- a/Demos/OpenCLClothDemo/btOpenCLSupport.h
+++ b/Demos/OpenCLClothDemo/btOpenCLSupport.h
@@ -0,0 +1,84 @@
 #ifndef BT_OPENCL_SUPPORT_HPP
 #define BT_OPENCL_SUPPORT_HPP
 // OpenCL support
 #include <CL/cl.hpp>
 namespace BTAcceleratedSoftBody
 {
 	class OpenCLSupportHelper
 	{
 	private:
 		cl::Context m_context;
 		std::vector<cl::Device> m_devices;
 		cl::CommandQueue m_queue;
 	public:
 		OpenCLSupportHelper()
 		{
 		}
 		virtual ~OpenCLSupportHelper()
 		{
 		}
 		cl::Device getDevice()
 		{
 			return m_devices[0];
 		}
 		cl::CommandQueue getCommandQueue()
 		{
 			return m_queue;
 		}
 		cl::Context getContext()
 		{
 			return m_context;
 		}
 		bool InitOpenCLDevice()
 		{
 			cl_int err;
 			std::vector<cl::Platform> platforms;
 			err = cl::Platform::get(&platforms);
 			checkErr(platforms.size() != 0 ? CL_SUCCESS : -1, "Platform::get()");
 			std::string platformVendor;
 			platforms[0].getInfo(CL_PLATFORM_VENDOR, &platformVendor);
 			//std::cout << "Platform is by: " << platformVendor << "\n";
 			intptr_t properties[] = {
 				CL_CONTEXT_PLATFORM, (intptr_t)platforms[0](),
 				0, 0
 			};
 			m_context = cl::Context(
 				CL_DEVICE_TYPE_GPU, 
 				properties, 
 				NULL, 
 				NULL, 
 				&err);
 			if (err != CL_SUCCESS)
 			{
 				btAssert( "Context::Context()" );
 			}
 			m_devices = m_context.getInfo<CL_CONTEXT_DEVICES>();
 			if( m_devices.size() <= 0 ) 
 			{
 				btAssert( "devices.size() > 0" );
 			}
 			m_queue = cl::CommandQueue(m_context, m_devices[0], 0, &err);
 		    if (err != CL_SUCCESS) 
 			{
 				btAssert( "CommandQueue::CommandQueue()");
 			}
 		}
 	};
 } // namespace BTAcceleratedSoftBody
 #endif // #ifndef BT_OPENCL_SUPPORT_HPP
--- a/Demos/OpenCLClothDemo/cl_cloth_demo.cpp
+++ b/Demos/OpenCLClothDemo/cl_cloth_demo.cpp
@@ -0,0 +1,470 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2008 Advanced Micro Devices
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #ifdef _WIN32
 #include <GL/glew.h>
 #endif
 #include "clstuff.h"
 #include "gl_win.h"
 #include "cloth.h"
 #define USE_GPU_SOLVER
 // Number of flags: createFlag() builds this many soft bodies and main()
 // sizes the `cloths` render array with it.
 const int numFlags = 5;
 // Flag mesh resolution in vertices; passed as width/height to createFlag()
 // and to piece_of_cloth::create_buffers().
 const int clothWidth = 40;
 const int clothHeight = 60;//60;
 // Wind direction angle, fed to cos()/sin() in updatePhysicsWorld(), which
 // also advances it and wraps it back to 0.
 float _windAngle = 1.0;//0.4;
 // Scale applied to the wind direction vector in updatePhysicsWorld().
 float _windStrength = 15;
 #include <iostream>
 using namespace std;
 #include "btBulletDynamicsCommon.h"
 #include "LinearMath/btHashMap.h"
 #include "BulletSoftBody/btSoftRigidDynamicsWorld.h"
 #include "vectormath/vmInclude.h"
 #include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolver_CPU.h"
 #include "BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h"
 using Vectormath::Aos::Vector3;
 class piece_of_cloth;
 class btBroadphaseInterface;
 class btCollisionShape;
 class btOverlappingPairCache;
 class btCollisionDispatcher;
 class btConstraintSolver;
 struct btCollisionAlgorithmCreateFunc;
 class btDefaultCollisionConfiguration;
 namespace Vectormath
 {
 	namespace Aos
 	{
 		class Transform3;
 	}
 }
 // Collision shapes created by initBullet() (currently only the ground box).
 btAlignedObjectArray<btCollisionShape*>	m_collisionShapes;
 // Bullet world components; all four are allocated in initBullet().
 btBroadphaseInterface*	m_broadphase;
 btCollisionDispatcher*	m_dispatcher;
 btConstraintSolver*	m_solver;
 btDefaultCollisionConfiguration* m_collisionConfiguration;
 // Soft body solvers: initBullet() creates exactly one of the first two
 // (chosen by USE_GPU_SOLVER) and points g_solver at it.
 btCPUSoftBodySolver *g_cpuSolver = NULL;
 btOpenCLSoftBodySolver *g_openCLSolver = NULL;
 btSoftBodySolver *g_solver = NULL;
 // Soft bodies appended by createFlag(); index-parallel to `cloths`.
 btAlignedObjectArray<btSoftBody *> m_flags;
 // The dynamics world, created in initBullet().
 btSoftRigidDynamicsWorld* m_dynamicsWorld;
 // Render-side buffers, one per flag; resized to numFlags in main().
 btAlignedObjectArray<piece_of_cloth> cloths;
 // OpenCL handles defined in clstuff.cpp.
 extern cl_context			g_cxMainContext;
 extern cl_device_id		g_cdDevice;
 extern cl_command_queue	g_cqCommandQue;
 // Distance along z between neighbouring flags (see createFlag()).
 const float flagSpacing = 30.f;
 // Helper to test and add links correctly.
 // Records links that have already been generated.
 //
 // trianglesForLinks is a numVertices x numVertices table holding, for every
 // vertex pair, the triangle that first produced that edge (or a negative
 // value if the edge has not been seen yet).
 //  - Edge not seen yet: a structural link (vertex0, vertex1) is appended and
 //    the table is updated in both directions.
 //  - Edge already seen: no second structural link is added. If
 //    createBendLinks is set, a bend link is appended between nonLinkVertex
 //    and the vertex of the recorded triangle that is not on the shared edge.
 // Always returns true.
 static bool testAndAddLink( btAlignedObjectArray<int> &trianglesForLinks, btSoftBody *softBody, int triangle, int *triangleVertexIndexArray, int numVertices, int vertex0, int vertex1, int nonLinkVertex, btSoftBody::Material *structuralMaterial, bool createBendLinks, btSoftBody::Material *bendMaterial )
 {
 	const int edgeSlot = numVertices * vertex0 + vertex1;
 	const int otherTriangle = trianglesForLinks[ edgeSlot ];
 	if( otherTriangle < 0 )
 	{
 		// Don't yet have link so create it and remember which triangle owns it
 		softBody->appendLink( vertex0, vertex1, structuralMaterial );
 		trianglesForLinks[ edgeSlot ] = triangle;
 		trianglesForLinks[ numVertices * vertex1 + vertex0 ] = triangle;
 		return true;
 	}
 	// The edge already has its structural link; without bend links there is
 	// nothing more to add (re-appending would duplicate the link).
 	if( !createBendLinks )
 		return true;
 	// Find the vertex of the other triangle that is not part of the shared edge.
 	// As in the sequential tests this replaces, the last match wins.
 	int nodeA = -1;
 	for( int corner = 0; corner < 3; ++corner )
 	{
 		const int candidate = triangleVertexIndexArray[ otherTriangle * 3 + corner ];
 		if( candidate != vertex0 && candidate != vertex1 )
 			nodeA = candidate;
 	}
 	// Skip degenerate neighbours: no opposite vertex found, or it coincides
 	// with this triangle's own opposite vertex.
 	if( nodeA >= 0 && nodeA != nonLinkVertex )
 		softBody->appendLink( nodeA, nonLinkVertex, bendMaterial );
 	return true;
 }
 // Builds a soft body from an indexed triangle list.
 //  vertexArray              numVertices node positions
 //  triangleVertexIndexArray 3 * numTriangles vertex indices
 //  createBendLinks          also connect the opposite vertices of triangles
 //                           that share an edge, using a softer material
 // Returns a soft body allocated with new against m_dynamicsWorld's world
 // info; the caller is responsible for it.
 btSoftBody *createFromIndexedMesh( btVector3 *vertexArray, int numVertices, int *triangleVertexIndexArray, int numTriangles, bool createBendLinks )
 {
 	btSoftBody* softBody = new btSoftBody(&(m_dynamicsWorld->getWorldInfo()), numVertices, vertexArray, 0);
 	btSoftBody::Material * structuralMaterial = softBody->appendMaterial();
 	btSoftBody::Material * bendMaterial = NULL;
 	if( createBendLinks )
 	{
 		bendMaterial = softBody->appendMaterial();
 		bendMaterial->m_kLST = 0.7;
 	}
 	structuralMaterial->m_kLST = 1.0;
 	// One entry per ordered vertex pair saying which triangle first produced
 	// that edge; -1 to start. Once a value is entered we know the "other"
 	// triangle and can add a bend link across the shared edge.
 	btAlignedObjectArray<int> triangleForLinks;
 	triangleForLinks.resize( numVertices * numVertices, -1 );
 	for( int triangle = 0; triangle < numTriangles; ++triangle )
 	{
 		const int *corner = triangleVertexIndexArray + triangle * 3;
 		softBody->appendFace( corner[0], corner[1], corner[2] );
 		// Generate the structural links directly from the three triangle
 		// edges; the third index passed is the vertex opposite that edge.
 		testAndAddLink( triangleForLinks, softBody, triangle, triangleVertexIndexArray, numVertices, corner[0], corner[1], corner[2], structuralMaterial, createBendLinks, bendMaterial );
 		testAndAddLink( triangleForLinks, softBody, triangle, triangleVertexIndexArray, numVertices, corner[1], corner[2], corner[0], structuralMaterial, createBendLinks, bendMaterial );
 		testAndAddLink( triangleForLinks, softBody, triangle, triangleVertexIndexArray, numVertices, corner[2], corner[0], corner[1], structuralMaterial, createBendLinks, bendMaterial );
 	}
 	return softBody;
 }
 /**
 * Create a sequence of flag objects and add them to the world.
 *
 * Builds one width x height vertex grid, then creates numFlags soft bodies
 * from it, each rotated and translated flagSpacing apart along z, appends
 * them to `flags` and adds them to m_dynamicsWorld.
 *
 * @param solver  currently unused by this function
 * @param width   number of vertices per row
 * @param height  number of vertex rows
 * @param flags   receives the created soft bodies
 */
 void createFlag( btSoftBodySolver &solver, int width, int height, btAlignedObjectArray<btSoftBody *> &flags )
 {
 	(void)solver;
 	using Vectormath::Aos::Matrix3;
 	using Vectormath::Aos::Vector3;
 	// Allocate a simple mesh consisting of a vertex array and a triangle index array
 	btIndexedMesh mesh;
 	mesh.m_numVertices = width*height;
 	mesh.m_numTriangles = 2*(width-1)*(height-1);
 	btVector3 *vertexArray = new btVector3[mesh.m_numVertices];
 	mesh.m_vertexBase = reinterpret_cast<const unsigned char*>(vertexArray);
 	int *triangleVertexIndexArray = new int[3*mesh.m_numTriangles];
 	mesh.m_triangleIndexBase = reinterpret_cast<const unsigned char*>(triangleVertexIndexArray);
 	mesh.m_triangleIndexStride = sizeof(int)*3;
 	// The stride must describe the element type actually stored in vertexArray.
 	mesh.m_vertexStride = sizeof(btVector3);
 	// Generate normalised object space vertex coordinates for a rectangular flag
 	float zCoordinate = 0.0f;
 	Matrix3 defaultScale(Vector3(5.f, 0.f, 0.f), Vector3(0.f, 20.f, 0.f), Vector3(0.f, 0.f, 1.f));
 	for( int y = 0; y < height; ++y )
 	{
 		float yCoordinate = y*2.0f/float(height) - 1.0f;
 		for( int x = 0; x < width; ++x )
 		{
 			float xCoordinate = x*2.0f/float(width) - 1.0f;
 			Vector3 vertex(xCoordinate, yCoordinate, zCoordinate);
 			Vector3 transformedVertex = defaultScale*vertex;
 			vertexArray[y*width + x] = btVector3(transformedVertex.getX(), transformedVertex.getY(), transformedVertex.getZ() );
 		}
 	}
 	// Generate vertex indices for triangles: every grid cell yields two
 	// triangles stored back to back, three indices each.
 	for( int y = 0; y < (height-1); ++y )
 	{
 		for( int x = 0; x < (width-1); ++x )
 		{
 			const int cell = y*(width-1) + x;
 			const int corner = y*width + x;
 			// Triangle 0: top left of square on mesh
 			int *triangle0 = triangleVertexIndexArray + 3*(2*cell);
 			triangle0[0] = corner;
 			triangle0[1] = corner + 1;
 			triangle0[2] = corner + width;
 			// Triangle 1: bottom right of square on mesh
 			int *triangle1 = triangle0 + 3;
 			triangle1[0] = corner + 1;
 			triangle1[1] = corner + 1 + width;
 			triangle1[2] = corner + width;
 		}
 	}
 	float rotateAngleRoundZ = 0.5;
 	float rotateAngleRoundX = 0.5;
 	btMatrix3x3 defaultRotate;
 	defaultRotate[0] = btVector3(cos(rotateAngleRoundZ), sin(rotateAngleRoundZ), 0.f);
 	defaultRotate[1] = btVector3(-sin(rotateAngleRoundZ), cos(rotateAngleRoundZ), 0.f);
 	defaultRotate[2] = btVector3(0.f, 0.f, 1.f);
 	btMatrix3x3 defaultRotateX;
 	defaultRotateX[0] = btVector3(1.f, 0.f, 0.f);
 	defaultRotateX[1] = btVector3( 0.f, cos(rotateAngleRoundX), sin(rotateAngleRoundX));
 	defaultRotateX[2] = btVector3(0.f, -sin(rotateAngleRoundX), cos(rotateAngleRoundX));
 	btMatrix3x3 defaultRotateAndScale( (defaultRotateX*defaultRotate) );
 	// Construct the sequence flags applying a slightly different translation to each one to arrange them
 	// appropriately in the scene.
 	for( int flagIndex = 0; flagIndex < numFlags; ++flagIndex )
 	{
 		float zTranslate = flagSpacing * (flagIndex-numFlags/2);
 		btVector3 defaultTranslate(0.f, 20.f, zTranslate);
 		btTransform transform( defaultRotateAndScale, defaultTranslate );
 		btSoftBody *softBody = createFromIndexedMesh( vertexArray, mesh.m_numVertices, triangleVertexIndexArray, mesh.m_numTriangles, true );
 		// Spread a total mass of 10 evenly over the nodes
 		for( int vertexIndex = 0; vertexIndex < mesh.m_numVertices; ++vertexIndex )
 		{
 			softBody->setMass(vertexIndex, 10.f/mesh.m_numVertices);
 		}
 		// Zero mass on both ends and the middle of the last vertex row
 		softBody->setMass((height-1)*(width), 0.f);
 		softBody->setMass((height-1)*(width) + width - 1, 0.f);
 		softBody->setMass((height-1)*width + width/2, 0.f);
 		softBody->m_cfg.collisions = btSoftBody::fCollision::CL_SS+btSoftBody::fCollision::CL_RS;
 		flags.push_back( softBody );
 		softBody->transform( transform );
 		m_dynamicsWorld->addSoftBody( softBody );
 	}
 	// The temporary arrays are released here, after every flag has been built from them.
 	delete [] vertexArray;
 	delete [] triangleVertexIndexArray;
 }
 // Per-frame scene update. On every 400th call (starting with the first) the
 // global wind angle is advanced and each flag receives a new wind velocity
 // with a small random deviation in direction.
 void updatePhysicsWorld()
 {
 	static int counter = 0;
 	const bool retargetWind = ( (counter % 400) == 0 );
 	++counter;
 	if( !retargetWind )
 		return;
 	// Advance the wind direction, starting again from 0 once it passes a full turn
 	_windAngle = (_windAngle + 0.05f);
 	if( _windAngle > (2*3.141) )
 		_windAngle = 0;
 	for( int flagIndex = 0; flagIndex < m_flags.size(); ++flagIndex )
 	{
 		// Each flag gets its own slightly perturbed direction in the x/z plane
 		float localWind = _windAngle + 0.5*(((float(rand())/RAND_MAX))-0.1);
 		float xCoordinate = cos(localWind)*_windStrength;
 		float zCoordinate = sin(localWind)*_windStrength;
 		m_flags[flagIndex]->setWindVelocity( btVector3(xCoordinate, 0, zCoordinate) );
 	}
 }
 // Creates the soft body solver, the Bullet dynamics world and the flags, then
 // attaches a CPU vertex buffer descriptor to every entry of `cloths`.
 // Expects `cloths` to already hold one created buffer per flag.
 void initBullet(void)
 {
 	// Solver backend is chosen at compile time; g_solver is the handle used from here on.
 #ifdef USE_GPU_SOLVER
 	g_openCLSolver = new btOpenCLSoftBodySolver( g_cqCommandQue, g_cxMainContext);
 	g_solver = g_openCLSolver;
 #else
 	g_cpuSolver = new btCPUSoftBodySolver;
 	g_solver = g_cpuSolver;
 #endif
 	// Collision and constraint components of the world
 	m_collisionConfiguration = new btDefaultCollisionConfiguration();
 	m_dispatcher = new btCollisionDispatcher(m_collisionConfiguration);
 	m_broadphase = new btDbvtBroadphase();
 	m_solver = new btSequentialImpulseConstraintSolver;
 	m_dynamicsWorld = new btSoftRigidDynamicsWorld(m_dispatcher, m_broadphase, m_solver, m_collisionConfiguration, g_solver);
 	m_dynamicsWorld->setGravity(btVector3(0,-10,0));
 	// The ground shape is created and tracked, but the rigid body using it is compiled out below.
 	btCollisionShape* groundShape = new btBoxShape(btVector3(btScalar(50.),btScalar(50.),btScalar(50.)));
 	m_collisionShapes.push_back(groundShape);
 	// Soft body world parameters: air only, no water
 	m_dynamicsWorld->getWorldInfo().air_density = (btScalar)1.2;
 	m_dynamicsWorld->getWorldInfo().water_density = 0;
 	m_dynamicsWorld->getWorldInfo().water_offset = 0;
 	m_dynamicsWorld->getWorldInfo().water_normal = btVector3(0,0,0);
 	m_dynamicsWorld->getWorldInfo().m_gravity.setValue(0,-10,0);
 #if 0
 	{
 		btTransform groundTransform;
 		groundTransform.setIdentity();
 		groundTransform.setOrigin(btVector3(0,-50,0));
 		btScalar mass(0.);
 		//rigidbody is dynamic if and only if mass is non zero, otherwise static
 		bool isDynamic = (mass != 0.f);
 		btVector3 localInertia(0,0,0);
 		if (isDynamic)
 			groundShape->calculateLocalInertia(mass,localInertia);
 		//using motionstate is recommended, it provides interpolation capabilities, and only synchronizes 'active' objects
 		btDefaultMotionState* myMotionState = new btDefaultMotionState(groundTransform);
 		btRigidBody::btRigidBodyConstructionInfo rbInfo(mass,myMotionState,groundShape,localInertia);
 		btRigidBody* body = new btRigidBody(rbInfo);
 		//add the body to the dynamics world
 		m_dynamicsWorld->addRigidBody(body);
 	}
 #endif
 	// g_solver refers to whichever solver object was created above
 	createFlag( *g_solver, clothWidth, clothHeight, m_flags );
 	// Create output buffer descriptions for each flag.
 	// These describe where the simulation should send output data to: the
 	// CPU-side float buffer of the matching cloth, with a vertex at float
 	// index 0, 8, 16 and so on and a normal at 3, 11, 19 etc.
 	for( int flagIndex = 0; flagIndex < m_flags.size(); ++flagIndex )
 	{
 		float *vertexData = reinterpret_cast< float* >(cloths[flagIndex].cpu_buffer);
 		cloths[flagIndex].m_vertexBufferDescriptor = new btCPUVertexBufferDescriptor(vertexData, 0, 8, 3, 8);
 	}
 	g_solver->optimize( m_dynamicsWorld->getSoftBodyArray() );
 }
 btClock m_clock;
 void doFlags()
 {
 	//float ms = getDeltaTimeMicroseconds();
 	btScalar dt = (btScalar)m_clock.getTimeMicroseconds();
 	m_clock.reset();
 	///step the simulation
 	if( m_dynamicsWorld )
 	{
 		m_dynamicsWorld->stepSimulation(dt/1000000.);
 		static int frameCount = 0;
 		frameCount++;
 		if (frameCount==100)
 		{
 			m_dynamicsWorld->stepSimulation(1./60.,0);
 			CProfileManager::dumpAll();
 		}
 		updatePhysicsWorld();
 	}
 	for( int flagIndex = 0; flagIndex < m_flags.size(); ++flagIndex )
 	{
 		g_solver->copySoftBodyToVertexBuffer( m_flags[flagIndex], cloths[flagIndex].m_vertexBufferDescriptor );
 		cloths[flagIndex].draw();
 	}
 }
 // Demo entry point: OpenCL first, then the render buffers, the physics
 // world, the GL window and textures, and finally the GL main loop.
 int main(int argc, char *argv[])
 {
 	initCL();
 	// One render buffer set per flag; these must exist before initBullet()
 	// attaches the vertex buffer descriptors to them.
 	cloths.resize(numFlags);
 	for( int bufferIndex = 0; bufferIndex < numFlags; ++bufferIndex )
 		cloths[bufferIndex].create_buffers(clothWidth, clothHeight);
 	initBullet();
 	m_dynamicsWorld->stepSimulation(1./60.,0);
 	preInitGL(argc, argv);
 	// Textures are assigned to the flags round-robin
 	std::string flagTexs[] = {
 		"atiFlag.bmp",
 		"amdFlag.bmp",
 	};
 	const int numFlagTexs = sizeof(flagTexs)/sizeof(flagTexs[0]);
 	for( int textureIndex = 0; textureIndex < numFlags; ++textureIndex )
 	{
 		cloths[textureIndex].create_texture(flagTexs[textureIndex % numFlagTexs]);
 		cloths[textureIndex].x_offset = 0;
 		cloths[textureIndex].y_offset = 0;
 		cloths[textureIndex].z_offset = 0;
 	}
 	goGL();
 	return 0;
 }
--- a/Demos/OpenCLClothDemo/cloth.h
+++ b/Demos/OpenCLClothDemo/cloth.h
@@ -0,0 +1,183 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2008 Advanced Micro Devices
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #include "gl_win.h" //for OpenGL stuff
 #include "bmpLoader.h"
 #include <string>
 #include "LinearMath/btScalar.h"
 // One render vertex: eight consecutive floats. piece_of_cloth::draw() hands
 // pos and texcoord to OpenGL with a stride of sizeof(vertex_struct), so the
 // member order and sizes must not change.
 struct vertex_struct 
 {
 	float pos[3];      // position, floats 0..2
 	float normal[3];   // normal, floats 3..5
 	float texcoord[2]; // texture coordinate, floats 6..7
 };
 class btVertexBufferDescriptor;
 // Render-side data for one flag: a CPU vertex buffer, a triangle index list
 // and an OpenGL texture.
 //
 // The class deliberately has no destructor: instances are stored by value in
 // a container and copied shallowly, so buffers are released only through an
 // explicit destroy() call on one of the copies.
 class piece_of_cloth 
 {
 	public:
 	// Frees the vertex and index buffers and returns to the "not created"
 	// state. Safe to call repeatedly. m_vertexBufferDescriptor is not touched
 	// (only a forward declaration of its type is visible here).
 	void destroy(void)
 	{
 		if(created)
 		{
 			delete [] cpu_buffer;
 			delete [] indices;
 		}
 		cpu_buffer = NULL;
 		indices = NULL;
 		created = false;
 	}
 	// Starts empty with every member in a defined state.
 	piece_of_cloth()
 	{
 		created = false;
 		cpu_buffer = NULL;
 		indices = NULL;
 		m_vertexBufferDescriptor = NULL;
 		x_offset = 0;
 		y_offset = 0;
 		z_offset = 0;
 		width = 0;
 		height = 0;
 		texture = 0;
 	}
 	bool created;                 // true between create_buffers() and destroy()
 	vertex_struct* cpu_buffer;    // width*height vertices, row major
 	unsigned int* indices;        // (width-1)*(height-1)*6 triangle indices
 	btVertexBufferDescriptor *m_vertexBufferDescriptor;
 	double x_offset, y_offset, z_offset;
 	int width;
 	int height;
 	GLuint texture;
 	// Draws the cloth as textured triangles straight from the CPU buffers.
 	// Does nothing until create_buffers() has been called.
 	void draw(void)
 	{
 		if(!created)
 		{
 			return;
 		}
 		glEnable(GL_TEXTURE_2D);
 		glBindTexture (GL_TEXTURE_2D, texture);
 		glEnable(GL_DEPTH_TEST);
 		glColor3f(0.0f, 1.0f, 1.0f);
 		glEnableClientState(GL_VERTEX_ARRAY);
 		//glEnableClientState(GL_NORMAL_ARRAY);
 		glEnableClientState(GL_TEXTURE_COORD_ARRAY);
 		glVertexPointer( 3, GL_FLOAT, sizeof(vertex_struct), reinterpret_cast< GLvoid* >(&(cpu_buffer[0].pos[0])) );
 		//glNormalPointer( 3, sizeof(vertex_struct), reinterpret_cast< GLvoid* >(&(cpu_buffer[0].normal[0])) );
 		glTexCoordPointer( 2, GL_FLOAT, sizeof(vertex_struct), reinterpret_cast< GLvoid* >(&(cpu_buffer[0].texcoord[0])) );
 		glDrawElements(GL_TRIANGLES, (height-1  )*(width-1)*3*2, GL_UNSIGNED_INT, indices);
 //		glDisableClientState(GL_NORMAL_ARRAY);
 		glDisableClientState(GL_VERTEX_ARRAY);
 		glDisableClientState(GL_TEXTURE_COORD_ARRAY);
 		glBindTexture(GL_TEXTURE_2D, 0);
 	}
 	// Loads a BMP file and uploads it as this cloth's texture.
 	// Prints an error naming the file and terminates the process if the
 	// bitmap cannot be loaded.
 	void create_texture(std::string filename)
 	{
 		amd::BitMap texBMP(filename.c_str());
 		if ( !texBMP.isLoaded() ) {
 			std::cout << "ERROR: could not load bitmap " << filename << std::endl;
 			exit(1);
 		}
 		glEnable(GL_TEXTURE_2D);
 		glGenTextures(1, &texture);
 		glBindTexture(GL_TEXTURE_2D, texture);
 		glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
 		glTexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_DECAL);
 		glTexImage2D(
 			GL_TEXTURE_2D,
 			0,
 			GL_RGBA8,
 			texBMP.getWidth(),
 			texBMP.getHeight(),
 			0,
 			GL_RGBA,
 			GL_UNSIGNED_BYTE,
 			texBMP.getPixels());
 		glBindTexture(GL_TEXTURE_2D, 0);
 	}
 	// Allocates and fills the vertex and index buffers for a width_ x height_
 	// vertex grid. Buffers from an earlier call are released first.
 	void create_buffers(int width_, int height_)
 	{
 		destroy();
 		width = width_;
 		height = height_;
 		// The trailing () value-initialises the array, i.e. all floats start at zero
 		cpu_buffer = new vertex_struct[width*height]();
 		// Initial test data for rendering
 		for(int y = 0; y < height; y++)
 		{
 			for(int x = 0; x < width; x++)
 			{
 				vertex_struct &vertex = cpu_buffer[y*width+x];
 				double coord = btSin(x/5.0)*0.01;
 				vertex.pos[0]      = (x/((float)(width-1)))*1;
 				vertex.pos[1]      = coord;
 				vertex.pos[2]      = (y/((float)(height-1)))*1;
 				vertex.normal[0]   = 1;
 				vertex.normal[1]   = 0;
 				vertex.normal[2]   = 0;
 				vertex.texcoord[0] = x/((float)(width-1));
 				vertex.texcoord[1] = y/((float)(height-1));
 			}
 		}
 		// Generate and fill index array for rendering:
 		// *3 indices/triangle, *2 triangles/quad - the same count draw() submits
 		const int numQuads = (width > 1 && height > 1) ? (width-1)*(height-1) : 0;
 		indices = new unsigned int[numQuads*3*2];
 		for(int y = 0; y < height-1; y++)
 		{
 			for(int x = 0; x < width-1; x++)
 			{
 				int baseIndex = (x + y*(width-1))*3*2;
 				indices[baseIndex] = x + y*width;
 				indices[baseIndex+1] = x+1 + y*width;
 				indices[baseIndex+2] = x+width + y*width;
 				indices[baseIndex+3] = x + 1 +  y*width;
 				indices[baseIndex+4] = x+(width+1) + y*width;
 				indices[baseIndex+5] = x+width + y*width;
 			}
 		}
 		created = true;
 	}
 };
--- a/Demos/OpenCLClothDemo/clstuff.cpp
+++ b/Demos/OpenCLClothDemo/clstuff.cpp
@@ -0,0 +1,53 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2008 Advanced Micro Devices
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #include "clstuff.h"
 #include "gl_win.h"
 #include "btOclCommon.h"
 #include "btOclUtils.h"
 #include "LinearMath/btScalar.h"
 // Global OpenCL state for the demo; all three are assigned by initCL().
 cl_context			g_cxMainContext;
 cl_device_id		g_cdDevice;
 cl_command_queue	g_cqCommandQue;
 // Creates the global OpenCL context (requesting CL_DEVICE_TYPE_ALL), picks a
 // device from it with btOclGetMaxFlopsDev, prints that device's info and
 // creates a command queue on it. The error code of the context and queue
 // creation is checked against CL_SUCCESS with oclCHECKERROR.
 void initCL(void)
 {
 	int ciErrNum = 0;
 	// Alternative device-type requests kept for reference:
    //g_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum);
 	//g_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum);
 	//g_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_CPU, &ciErrNum);
 	//try CL_DEVICE_TYPE_DEBUG for sequential, non-threaded execution, when using MiniCL on CPU, it gives a full callstack at the crash in the kernel
 //#ifdef USE_MINICL
 //	g_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_DEBUG, &ciErrNum);
 //#else
 	g_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum);
 //#endif
 	oclCHECKERROR(ciErrNum, CL_SUCCESS);
 	// Select the device used for the command queue and report it
 	g_cdDevice = btOclGetMaxFlopsDev(g_cxMainContext);
 	btOclPrintDevInfo(g_cdDevice);
 	// create a command-queue
 	g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, g_cdDevice, 0, &ciErrNum);
 	oclCHECKERROR(ciErrNum, CL_SUCCESS);
 }
--- a/Demos/OpenCLClothDemo/clstuff.h
+++ b/Demos/OpenCLClothDemo/clstuff.h
@@ -0,0 +1,10 @@
 #ifndef __CLSTUFF_HDR__
 #define __CLSTUFF_HDR__
 // Sets up the demo's global OpenCL context, device and command queue.
 void initCL(void);
 #endif //__CLSTUFF_HDR__
--- a/Demos/OpenCLClothDemo/clstuff.hpp
+++ b/Demos/OpenCLClothDemo/clstuff.hpp
@@ -0,0 +1,10 @@
 // NOTE(review): this header uses the same include guard and declaration as
 // clstuff.h; whichever of the two is included second becomes a no-op.
 #ifndef __CLSTUFF_HDR__
 #define __CLSTUFF_HDR__
 // Sets up the demo's global OpenCL context, device and command queue.
 void initCL(void);
 #endif //__CLSTUFF_HDR__
--- a/Demos/OpenCLClothDemo/fragment.glsl
+++ b/Demos/OpenCLClothDemo/fragment.glsl
@@ -0,0 +1,7 @@
 // Fragment shader: outputs the texel of 'tex' addressed by the s/t
 // components of texture coordinate set 0, without any further shading.
 uniform sampler2D tex;
 void main()
 {
 	gl_FragColor = texture2D(tex, gl_TexCoord[0].st);
 }
--- a/Demos/OpenCLClothDemo/gl_win.cpp
+++ b/Demos/OpenCLClothDemo/gl_win.cpp
@@ -0,0 +1,272 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2008 Advanced Micro Devices
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #include "clstuff.h"
 #include "gl_win.h"
 #include <cstdlib>
 #include <iostream>
 #include <fstream>
 #include <string>
 #include <iterator>
 #include <math.h>
 #include <cmath>
 #include <cstring>
 //#ifndef _WIN32 && !defined(__APPLE__)
 //#include <GL/glx.h>
 //#endif //!_WIN32
 static GLuint vbo = 0;
 #ifdef _WIN32
 #include <windows.h>
 #endif
 // Current window size in pixels; updated by reshape().
 static unsigned int windowWidth  = 1280;
 static unsigned int windowHeight = 1024;
 // mouse controls
 // Cursor position recorded by the last mouse()/motion() callback.
 int mouseOldX;
 int mouseOldY;
 // Bitmask of pressed buttons: bit (1<<button) is set by mouse() on GLUT_DOWN.
 int mouseButtons         = 0;
 // Camera state applied by render(): rotation angles passed to glRotatef and
 // the translation passed to glTranslatef. Initial values are set in initGL().
 float rotateX;
 float rotateY;
 float translateZ;
 float translateX;
 float translateY;
 static GLuint glProgram;
 // Called by render() once the modelview matrix is set up; defined elsewhere.
 void doFlags();
 // Draws one frame: clears the color and depth buffers, rebuilds the
 // modelview matrix from the global translate*/rotate* camera state and then
 // hands over to doFlags(). The VBO point-rendering path below is disabled.
 void render( void)
 {
 	glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );
 //	glDisable ( GL_CULL_FACE );
 	glMatrixMode( GL_MODELVIEW );
 	glLoadIdentity();
 	// Camera: translate first, then rotate about the x axis and the y axis.
 	glTranslatef( translateX, translateY, translateZ );
 	glRotatef( rotateX, 0.5f , 0.0f, 0.0f );
 	glRotatef( rotateY, 0.0f, 0.5f, 0.0f );
 //	glDisable (GL_BLEND);
 	doFlags();
 	// TODO: re-enable or remove the vertex-buffer drawing path below
 	//glBindBuffer(GL_ARRAY_BUFFER, vbo);
 	//glVertexPointer(4, GL_FLOAT, 0, NULL);
    //glEnableClientState(GL_VERTEX_ARRAY);
 	//glDrawArrays(GL_POINTS, 0, 4*4);
 //	glDisableClientState(GL_VERTEX_ARRAY);
 //   glBindBuffer(GL_ARRAY_BUFFER, 0);
 //	glUseProgram(0);
 }
 // One-time GL setup: sets the clear color and the initial camera
 // rotation/translation used by render(). Both lighting/material setups
 // below are compiled out with '#if 0'.
 static void initGL(void) 
 {
 	//glClearColor( 0.05f, 0.0f, 0.1f, 0.1f );
 	glClearColor(  0.0f, 0.45f, 0.45f, 1.f);
 	// Disabled: material/light setup with lighting itself left switched off.
 #if 0
 	GLfloat mat_specular[] = { 1.0f, 1.0f, 1.0f, 1.0f };
 	GLfloat mat_shininess[] = { 50.0f };
 	GLfloat light_position[] = { 
 	   -10.f, 
 	   5.f, 
 	   -1.f, 
 	   1.0f };
 	glEnable ( GL_COLOR_MATERIAL );
 	glShadeModel( GL_SMOOTH );
 	glEnable( GL_LINE_SMOOTH );
 	glMaterialfv( GL_FRONT, GL_SPECULAR, mat_specular );
 	glMaterialfv( GL_FRONT, GL_SHININESS, mat_shininess );
 	glLightfv( GL_LIGHT0, GL_POSITION, light_position );
 	//glEnable( GL_LIGHTING );
 	//glEnable( GL_LIGHT0 ); // Switch on and crashes!
 	glEnable( GL_DEPTH_TEST );
 #endif 
 	// Disabled: same setup with lighting enabled. NOTE(review): this block
 	// uses the arrays declared inside the first '#if 0' block, so it cannot
 	// be enabled on its own.
 #if 0
   glEnable ( GL_COLOR_MATERIAL );
   glShadeModel( GL_SMOOTH );
   glEnable( GL_LINE_SMOOTH );
   glMaterialfv( GL_FRONT, GL_SPECULAR, mat_specular );
   glMaterialfv( GL_FRONT, GL_SHININESS, mat_shininess );
   glLightfv( GL_LIGHT0, GL_POSITION, light_position );
   glEnable( GL_LIGHTING );
   glEnable( GL_LIGHT0 );
   glEnable( GL_DEPTH_TEST );
 #endif
   // Initial camera placement
   rotateX    = 0;
   rotateY    = 30;
   translateX = 0.0f;
   translateY = -30.0f;
   translateZ = -120.0;
 }
 // GLUT display callback: renders a frame, swaps the buffers and immediately
 // requests another redisplay, so the scene is redrawn continuously.
 void display(void)
 {
 	render();
 	glutSwapBuffers();
 	glutPostRedisplay();
 }
 // GLUT keyboard callback.
 //   'q' or ESC : quit the application
 //   'a' / 'z'  : move the camera translation up / down   (translateY)
 //   'd' / 's'  : move the camera translation right / left (translateX)
 //   'f' / 'g'  : move the camera translation along Z      (translateZ)
 // The cursor position arguments are unused.
 void keyboard( unsigned char key, int /*x*/, int /*y*/)
 {
  switch( key) {
  case('q') :
  // GLUT reports ESC as ASCII 27 on every platform (the same value as the
  // Windows-only VK_ESCAPE), so no platform guard is needed.
  case 27:
    exit(0);
  break;
  case('a'):
    translateY += 0.1f;
    break;
  case('z'):
    translateY -= 0.1f;
    break;
  case('d'):
    translateX += 0.1f;
    break;
  case('s'):
    translateX -= 0.1f;
    break;
  case('f'):
    translateZ += 0.1f;
    break;
  case('g'):
    translateZ -= 0.1f;
    break;
  }
 }
 // GLUT mouse-button callback: tracks which buttons are held in the
 // mouseButtons bitmask (bit 1<<button), remembers the cursor position for
 // motion() and requests a redraw.
 void mouse(int button, int state, int x, int y)
 {
  if (state == GLUT_DOWN) {
    mouseButtons |= 1<<button;
  } else if (state == GLUT_UP) {
    // Clear only the released button so that any other button that is still
    // held keeps driving motion().
    mouseButtons &= ~(1<<button);
  }
  mouseOldX = x;
  mouseOldY = y;
  glutPostRedisplay();
 }
 // GLUT mouse-motion callback: turns the cursor delta since the last event
 // into camera changes, depending on the buttons held (see mouseButtons):
 //   bits 1 and 4 together (mask 5) : pan      (translateX / translateY)
 //   bit 1 only                     : rotate   (rotateX / rotateY)
 //   bit 4 only                     : zoom     (translateZ)
 // The combined mask is tested first; testing '& 1' and then '& 5' made the
 // final '& 4' branch unreachable.
 void motion(int x, int y)
 {
  float dx, dy;
  dx = x - mouseOldX;
  dy = y - mouseOldY;
  if ((mouseButtons & 5) == 5) {
    translateY -= dy * 0.01;
    translateX -= dx * 0.01;
  }
  else if (mouseButtons & 1) {
    rotateX += dy * 0.2;
    rotateY += dx * 0.2;
  }
  else if (mouseButtons & 4) {
    translateZ += dy * 0.01;
  }
  mouseOldX = x;
  mouseOldY = y;
 }
 // GLUT reshape callback: records the new window size, resets the viewport
 // to cover the whole window and rebuilds the perspective projection
 // (60 degree field of view, near plane 0.1, far plane 600).
 void reshape (int w, int h)
 {
 	// A window can be resized to zero height; clamp so the aspect ratio
 	// below never divides by zero.
 	if (h < 1)
 	{
 		h = 1;
 	}
 	windowWidth  = w;
 	windowHeight = h;
 	glViewport(0, 0, windowWidth, windowHeight);
 	glMatrixMode(GL_PROJECTION);
 	glLoadIdentity();
 	gluPerspective(
 		60.0,
 		(GLfloat)windowWidth / (GLfloat) windowHeight,
 		0.1,
 		600.0f );
 }
 // Hands control to GLUT by entering glutMainLoop().
 void goGL(void)
 {
 	glutMainLoop();
 }
 // Initializes GLUT with the program arguments, creates a double-buffered
 // RGBA window with a depth buffer at the default window size, runs the
 // one-time GL setup and registers the display, reshape, keyboard, mouse and
 // motion callbacks. Call goGL() afterwards to start the event loop.
 void preInitGL(int argc, char ** argv)
 {
 	glutInit( &argc, argv );
 	glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH );
 	glutInitWindowSize( windowWidth, windowHeight );    
 	glutCreateWindow ("OpenCL Renderer");
 	initGL();
 	// Set up viewport and projection once before the first reshape event
 	glViewport( 0, 0, windowWidth, windowHeight);
 	reshape( windowWidth, windowHeight );
 	glutDisplayFunc(display); 
 	glutReshapeFunc(reshape);
 	glutKeyboardFunc(keyboard);
 	glutMouseFunc(mouse);
 	glutMotionFunc(motion);
 }
 /*
 int getVBO( std::string, int s)
 {
 	GLuint size = (GLuint)s;
 	if (vbo == 0) {
 		// Create VBO
 		// create buffer object
 		glGenBuffers(1, &vbo);
 		glBindBuffer(GL_ARRAY_BUFFER, vbo);
 		glBufferData(GL_ARRAY_BUFFER, size, 0, GL_STATIC_DRAW);
 		glBindBuffer(GL_ARRAY_BUFFER, 0);
 	}
 	return vbo;
 }
 */
--- a/Demos/OpenCLClothDemo/gl_win.h
+++ b/Demos/OpenCLClothDemo/gl_win.h
@@ -0,0 +1,49 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2008 Advanced Micro Devices
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 // Platform-dependent OpenGL/GLU/GLUT includes plus the entry points of the
 // demo's GLUT window code.
 #ifndef __GL_WIN_HDR__
 #define __GL_WIN_HDR__
 #ifdef _WIN32//for glut.h
 #include <windows.h>
 #endif
 // Apple ships the GL/GLUT headers under framework-style paths
 #if defined(__APPLE__) && !defined (VMDMESA)
 #include <OpenGL/OpenGL.h>
 #include <OpenGL/gl.h>
 #include <OpenGL/glu.h>
 #include <GLUT/glut.h>
 #else
 // NOTE(review): the _WINDOWS branch includes gl.h and glu.h but no GLUT
 // header - confirm GLUT is provided some other way in that configuration.
 #ifdef _WINDOWS
 #include <windows.h>
 #include <GL/gl.h>
 #include <GL/glu.h>
 #else
 #include <GL/glut.h>
 #endif //_WINDOWS
 #endif //APPLE
 #include <string>
 // Enters the GLUT main loop.
 void goGL(void);
 // Initializes GLUT, creates the window and registers the callbacks.
 void preInitGL(int argc, char ** argv);
 //int getVBO( std::string, int size );
 #endif //__GL_WIN_HDR__
--- a/Demos/OpenCLClothDemo/gl_win.hpp
+++ b/Demos/OpenCLClothDemo/gl_win.hpp
@@ -0,0 +1,34 @@
 // Platform-dependent OpenGL/GLU/GLUT includes plus the entry points of the
 // demo's GLUT window code.
 // NOTE(review): this header shares its include guard with gl_win.h, so
 // whichever of the two is included second becomes a no-op.
 #ifndef __GL_WIN_HDR__
 #define __GL_WIN_HDR__
 #ifdef _WIN32//for glut.h
 #include <windows.h>
 #endif
 // Apple ships the GL/GLUT headers under framework-style paths
 #if defined(__APPLE__) && !defined (VMDMESA)
 #include <OpenGL/OpenGL.h>
 #include <OpenGL/gl.h>
 #include <OpenGL/glu.h>
 #include <GLUT/glut.h>
 #else
 // NOTE(review): the _WINDOWS branch includes gl.h and glu.h but no GLUT
 // header - confirm GLUT is provided some other way in that configuration.
 #ifdef _WINDOWS
 #include <windows.h>
 #include <GL/gl.h>
 #include <GL/glu.h>
 #else
 #include <GL/glut.h>
 #endif //_WINDOWS
 #endif //APPLE
 #include <string>
 // Enters the GLUT main loop.
 void goGL(void);
 // Initializes GLUT, creates the window and registers the callbacks.
 void preInitGL(int argc, char ** argv);
 // NOTE(review): the only definition of getVBO visible in gl_win.cpp is
 // commented out - confirm a definition exists before calling this.
 int getVBO( std::string, int size );
 #endif //__GL_WIN_HDR__
--- a/Demos/OpenCLClothDemo/shaders.cl
+++ b/Demos/OpenCLClothDemo/shaders.cl
@@ -0,0 +1,535 @@
 #pragma OPENCL EXTENSION cl_amd_printf : enable                        
 // 3-component vector type names are aliased to their 4-component types.
 #define float3 float4
 #define uint3  uint4
 // Scale applied to the unit sprite corners in geometryShader. Written as a
 // single-precision literal without a trailing ';' so that the macro can be
 // used inside any expression.
 #define PARTICLE_RADIUS 0.05f
 // Fixed screen size assumed by the rasterizer kernels
 #define width 1280
 #define height 1024
 // Screen bounds (bottom, top, left, right) used for the viewport mapping
 #define B 0 
 #define T height
 #define L 0
 #define R width
 // Fixed-point format of the optimized rasterizer: 4 fractional bits
 #define shiftNumber 4
 #define shiftMask 0xF
 #define shiftValue 16.0f
 #define screenWidth1 width
 #define screenHeight1 height
 // Compound macros are parenthesized so they expand safely inside larger
 // expressions (e.g. 'a % halfScreenWidth1').
 #define halfScreenWidth1 (screenWidth1/2)
 #define halfScreenHeight1 (screenHeight1/2)
 #define screenWidth1SubOne (screenWidth1-1)
 #define screenHeight1SubOne (screenHeight1-1)
 // 'stride' used to be defined twice (first as 4, then as screenWidth1);
 // only the later definition, the one in effect at every use, is kept.
 #define stride screenWidth1 
 #define screenPixelNumber (screenWidth1*screenHeight1)
 // NOTE(review): depthComplexity is not defined in this file; this macro
 // only compiles where the user defines it before use.
 #define depthBufferSize (screenPixelNumber*depthComplexity)
 #define WGS 1
 //---------------------------------------------------------------
 // Per-particle record written by vertexShaderSprite and read by
 // geometryShader.
 struct __VSSpriteOut
 {
    float4 position; 
    float4 particlePosition; 
 };
 // The typedef must name the struct tag exactly (capital 'O'); the previous
 // spelling referred to a different, never-defined struct.
 typedef struct __VSSpriteOut VSSpriteOut;
 // One emitted sprite corner: its position after projection and its texture
 // coordinate.
 struct __GSSpriteOut
 {
    float4 position;
    float2 textureUV;
 //	float4 viewSpacePosition;
 //	float4 particlePosition;
 };
 typedef struct __GSSpriteOut GSSpriteOut;
 //------------------------------------------------------------------------------
 // Corner offsets of a sprite quad in the order top-left, top-right,
 // bottom-left, bottom-right; scaled by PARTICLE_RADIUS in geometryShader.
 __constant float4 g_positions[4] =
 {
  (float4)(-1.0f, 1.0f, 0.0f, 0.0f),
  (float4)( 1.0f, 1.0f, 0.0f, 0.0f),
  (float4)( -1.0f, -1.0f, 0.0f, 0.0f),
  (float4)( 1.0f, -1.0f, 0.0f, 0.0f)
 };
 // Texture coordinates matching the four corners of g_positions.
 // geometryShader currently uses a private copy of this table instead.
 __constant float2 g_texcoords[4] = 
 { 
 	(float2)(0.0f,0.0f), 
    (float2)(1.0f,0.0f),
    (float2)(0.0f,1.0f),
    (float2)(1.0f,1.0f)
 };
 //------------------------------------------------------------------------------
 // Copies the 16 elements of matrix0 (constant address space) into matrix.
 void copyMatrix(
 	float matrix[16],
 	__constant float matrix0[16])
 {
 	for (uint idx = 0; idx != 16; ++idx)
 		matrix[idx] = matrix0[idx];
 }
 // Computes element i of the 4x4 product matrix0 * matrix1 and stores it in
 // matrix[i]. With element (r,c) stored at index r + 4*c, the element written
 // is (r = i%4, c = i/4): the dot product of row r of matrix0 with column c
 // of matrix1, accumulated in index order k = 0..3.
 void matrixMulLoopBody(	
 	uint i,
 	float matrix[16], 
 	__constant float matrix0[16], 
 	__constant float matrix1[16])
 {
 	const uint r       = i % 4;
 	const uint colBase = (i / 4) * 4;
 	float acc = 0.0f;
 	for (uint k = 0; k < 4; k++) {
 		acc += matrix0[r + (k*4)] * matrix1[k + colBase];
 	}
 	matrix[i] = acc;
 }
 // matrix = matrix0 * matrix1 for 4x4 matrices stored as 16 floats; every
 // output element is produced by matrixMulLoopBody, in index order 0..15.
 void matrixMul(
 	float matrix[16], 
 	__constant float matrix0[16], 
 	__constant float matrix1[16])
 {
 	for (uint element = 0; element < 16; element++) {
 		matrixMulLoopBody(element, matrix, matrix0, matrix1);
 	}
 }
 // Returns matrix * vector for a 4x4 matrix stored as 16 floats with element
 // (r,c) at index r + 4*c; each result component is the dot product of one
 // matrix row with the vector.
 float4 matrixVectorMul(float matrix[16], float4 vector)
 {
 	return (float4)(
 		matrix[0]*vector.x + matrix[4]*vector.y + matrix[8]*vector.z  + matrix[12]*vector.w,
 		matrix[1]*vector.x + matrix[5]*vector.y + matrix[9]*vector.z  + matrix[13]*vector.w,
 		matrix[2]*vector.x + matrix[6]*vector.y + matrix[10]*vector.z + matrix[14]*vector.w,
 		matrix[3]*vector.x + matrix[7]*vector.y + matrix[11]*vector.z + matrix[15]*vector.w);
 }
 // Returns matrix * vector for a 3x3 matrix stored as 9 floats with element
 // (r,c) at index r + 3*c. Only the x, y and z components of 'vector' are
 // used.
 float3 matrixVector3Mul(__constant float matrix[9], float3 vector)
 {
 	// float3 is aliased to float4 in this file, so the result carries a
 	// fourth component; zero-initialize it instead of returning an
 	// indeterminate value.
 	float3 result = (float3)(0.0f);
 	result.x = matrix[0]*vector.x + matrix[3+0]*vector.y + matrix[6+0]*vector.z;
 	result.y = matrix[1]*vector.x + matrix[3+1]*vector.y + matrix[6+1]*vector.z;
 	result.z = matrix[2]*vector.x + matrix[3+2]*vector.y + matrix[6+2]*vector.z;
 	return result;
 }
 //------------------------------------------------------------------------------
 // Debug helper, compiled only when DEVICE_CPU is defined (uncomment the
 // line below or define it externally).
 //#define DEVICE_CPU 1
 #if defined(DEVICE_CPU)
 // Prints the 16 floats of 'matrix' as four lines of four values, each line
 // labelled "name[n]" with n = 0..3.
 void printMatrix(char * name, __constant float matrix[16])
 {
 	printf("%s[0] = %f, %f, %f, %f\n", name, matrix[0], matrix[1], matrix[2], matrix[3]);	
 	printf("%s[1] = %f, %f, %f, %f\n", name, matrix[4], matrix[5], matrix[6], matrix[7]);	
 	printf("%s[2] = %f, %f, %f, %f\n", name, matrix[8], matrix[9], matrix[10], matrix[11]);	
 	printf("%s[3] = %f, %f, %f, %f\n", name, matrix[12], matrix[13], matrix[14], matrix[15]);	
 }
 #endif
 // vertexShader: one work-item per vertex.
 // Active variant: outputPrimitives[id] = projection * modelview * inputPrimitives[id].
 // Alternative variant (#else): copies the input vertex through unchanged.
 #if 1
 __kernel void vertexShader(
    __constant float modelview[16],
 	__constant float projection[16],
 	__global float4 * inputPrimitives, 
 	__global float4 * outputPrimitives)
 {
 	const uint id = get_global_id(0);
 	// Combined transform, rebuilt by every work-item
 	float mvp[16];
 	matrixMul(mvp, projection, modelview);
 	outputPrimitives[id] = matrixVectorMul(mvp, inputPrimitives[id]);
 }
 #else
 __kernel void vertexShader(
    __constant float modelview[16],
 	__constant float projection[16],
 	__global float4 * inputPrimitives, 
 	__global float4 * outputPrimitives)
 {
 	const uint id = get_global_id(0);
 	outputPrimitives[id] = inputPrimitives[id];
 }
 #endif
 //-----------------------------------------------------------------------------------
 // Writes 'color' to the pixel of 'image' addressed by this work-item's
 // two-dimensional global id.
 __kernel void
 clearImage(
 	__write_only image2d_t image,
 	float4 color)
 {
 	write_imagef(image, (int2)(get_global_id(0), get_global_id(1)), color);
 }
 // OpenGL viewport transformation
 // The site http://research.cs.queensu.ca/~jstewart/454/notes/pipeline/
 // contains a description of this process
 //
 // Maps the x/y components of v from [-1,1] to window coordinates and stores
 // them in *output:
 //     window = 0.5 * (v + 1) * (max - min) + min
 // where viewport[0] supplies min = (s0,s1) and max = (s2,s3), the layout
 // implied by the (s2 - s0) / (s3 - s1) extents used here.
 // The origin is added after scaling; the earlier parenthesization,
 // '(s2 - s0) + s0', cancelled it and was only correct for a zero origin.
 void 
 viewportTransform(float4 v, __constant int4 viewport[1], float2 * output)
 {
 	int4 vp = viewport[0];
 	float2 origin = (float2)(vp.s0, vp.s1);
 	float2 extent = (float2)(vp.s2-vp.s0, vp.s3-vp.s1);
 	*output 
 		= 0.5f * 
 		  (float2)(v.x+1,v.y+1) * 
 		  extent + 
 		  origin;
 }
 #define PARTICLE_WIDTH  32.0f
 #define PARTICLE_HEIGHT 32.0f
 // Unoptimized triangle rasterizer function
 // Details of the algorithm can be found here:
 //		http://www.devmaster.net/forums/showthread.php?t=1884
 //	
 // Rasterizes one triangle of the sprite belonging to this work-item. The
 // three vertices are outputPrimitives[id*4 + v1Offset/v2Offset/v3Offset];
 // their positions are mapped to window coordinates with viewportTransform
 // and every pixel of the bounding rectangle that passes all three
 // half-space tests is written to 'screen' as opaque white.
 // Texturing is not implemented yet: 'particle', 'sampler', the uv* values,
 // p1x..p2y, px/py and debugOut1 are currently unused.
 void
 rasterizerUnOpt(
    __global struct __GSSpriteOut * outputPrimitives,
 //	 __global float4 * outputPrimitives,
 	__constant int4  viewport[1],
 	__write_only image2d_t screen,
 	__read_only image2d_t particle,
 	uint v1Offset,
 	uint v2Offset,
 	uint v3Offset,
 	__global float4 * debugOut1)
 {
 	sampler_t sampler = 
 		CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 	uint id = get_global_id(0);
 	struct __GSSpriteOut output;
 	float2 v1, v2, v3;
 	float2 uv1, uv2, uv3;
 	// Fetch the three vertices and convert their positions to window space
 	output = outputPrimitives[id*4+v1Offset];
 	uv1    = output.textureUV;
 	viewportTransform(output.position, viewport, &v1);
 	output = outputPrimitives[id*4+v2Offset];
 	uv2    = output.textureUV;
 	viewportTransform(output.position, viewport, &v2);
 	output = outputPrimitives[id*4+v3Offset];
 	uv3    = output.textureUV;
 	viewportTransform(output.position, viewport, &v3);
 	// Bounding rectangle
 	int2 min_ = convert_int2(min(v1, min(v2, v3)));
 	int2 max_ = convert_int2(max(v1, max(v2, v3)));
 	// naive bi-linear interpolation for texture coords, note this is 
 	// broken with respect to OpenGL and needs to be fixed for the 
 	// general case.
 	float p1x = v2.x - v1.x;
 	float p1y = v2.y - v1.y;
 	float p2x = v3.x - v1.x;
 	float p2y = v3.y - v1.y;
 	// Scan through bounding rectangle
 	for(int y = min_.y; y < max_.y; y++) {
 		for(int x = min_.x; x < max_.x; x++) {
 			// When all half-space functions positive, pixel is in triangle
 			if((v1.x - v2.x) * (y - v1.y) - (v1.y - v2.y) * (x - v1.x) > 0 &&
 			 (v2.x - v3.x) * (y - v2.y) - (v2.y - v3.y) * (x - v2.x) > 0 &&
 			 (v3.x - v1.x) * (y - v3.y) - (v3.y - v1.y) * (x - v3.x) > 0) {
 				float px = x - v1.x;
 				float py = y - v1.y;
 					// Placeholder output: constant white instead of a texel
 					write_imagef(
 						screen, 
 						(int2)(x,y), 
 					//	texel);
 						(float4)(1.0f,1.0f,1.0f,1.0f));
 			}
 		}
 	}
 }
 // Optimized rasterizer function
 // Details of the algorithm can be found here:
 //		http://www.devmaster.net/forums/showthread.php?t=1884
 //	
 // Each work-item rasterizes one triangle. Its vertices are read from
 // outputPrimitives[id*4 + 0..2]; x and y are mapped from [-1,1] onto the
 // L..R / B..T screen bounds, converted to fixed point (shiftNumber
 // fractional bits) and the triangle's bounding box is scanned with
 // incrementally updated half-edge functions. Covered pixels are written to
 // 'screen' as opaque white. debugOut1 / debugOut2 receive intermediate
 // values for debugging only.
 //
 // Still work in progress. The bounding-box maximum used to be computed as
 // max(X1, min(X2, X3)), which cut the scan short whenever X1 (or Y1) was
 // not the largest coordinate; it now takes the maximum of all three.
 __kernel void
 rasterizerXX(
    __global float4 * outputPrimitives,
 	__write_only image2d_t screen,
 	__global float4 * debugOut1,
 	__global int2 * debugOut2)
 {
 	uint id = get_global_id(0);
 //	printf("ras\n");
 	float4 v1 = outputPrimitives[id*4+0];
 	float4 v2 = outputPrimitives[id*4+1];
 	float4 v3 = outputPrimitives[id*4+2];
 	// Viewport mapping to screen coordinates
 	float y1 = 0.5f* (v1.y+1) * (T - B) + B;
 	float y2 = 0.5f* (v2.y+1) * (T - B) + B;
 	float y3 = 0.5f* (v3.y+1) * (T - B) + B;
 	float x1 = 0.5f * (v1.x+1) * (R - L) + L;
 	float x2 = 0.5f * (v2.x+1) * (R - L) + L;
 	float x3 = 0.5f * (v3.x+1) * (R - L) + L;
 	// Fixed-point coordinates
 	const int Y1 = convert_int(shiftValue * y1);
 	const int Y2 = convert_int(shiftValue * y2);
 	const int Y3 = convert_int(shiftValue * y3);
 	const int X1 = convert_int(shiftValue * x1);
 	const int X2 = convert_int(shiftValue * x2);
 	const int X3 = convert_int(shiftValue * x3);
 	debugOut1[id*4+0]   = v1;
 	debugOut1[id*4+1]   = v2;
 	debugOut1[id*4+2]   = v3;
 	debugOut2[id*3+0] = (int2)(X1, Y1);
 	debugOut2[id*3+1] = (int2)(X2, Y2);
 	debugOut2[id*3+2] = (int2)(X3, Y3);
 	// Deltas
 	const int DX12 = X1 - X2;
 	const int DX23 = X2 - X3;
 	const int DX31 = X3 - X1;
 	const int DY12 = Y1 - Y2;
 	const int DY23 = Y2 - Y3;
 	const int DY31 = Y3 - Y1;
 	// Fixed-point deltas
 	const int FDX12 = DX12 << shiftNumber;
 	const int FDX23 = DX23 << shiftNumber;
 	const int FDX31 = DX31 << shiftNumber;
 	const int FDY12 = DY12 << shiftNumber;
 	const int FDY23 = DY23 << shiftNumber;
 	const int FDY31 = DY31 << shiftNumber;
 	// Bounding rectangle (rounded up to whole pixels)
 	int minx = (min(X1, min(X2, X3)) + shiftMask) >> shiftNumber;
 	//minx = max(0,minx);
 	int maxx = (max(X1, max(X2, X3)) + shiftMask) >> shiftNumber;
 	//min(maxx , screenWidth1SubOne);
 	int miny = (min(Y1, min(Y2, Y3)) + shiftMask) >> shiftNumber;
 	//max(0,miny);
 	int maxy = (max(Y1, max(Y2, Y3)) + shiftMask) >> shiftNumber;
 	//min(maxy , screenHeight1SubOne);
 	//(char*&)colorBuffer += miny * stride;
 	// Stand-in for the reference code's colour-buffer pointer; it is advanced
 	// per scanline but not otherwise used.
 	int offset = miny * stride;
 	// Half-edge constants
 	int C1 = DY12 * X1 - DX12 * Y1;
 	int C2 = DY23 * X2 - DX23 * Y2;
 	int C3 = DY31 * X3 - DX31 * Y3;
 	// Correct for fill convention
 	if(DY12 < 0 || (DY12 == 0 && DX12 > 0)) C1++;
 	if(DY23 < 0 || (DY23 == 0 && DX23 > 0)) C2++;
 	if(DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++;
 	// Half-edge function values at the top-left pixel of the bounding box
 	int CY1 = C1 + DX12 * (miny << shiftNumber) - DY12 * (minx << shiftNumber);
 	int CY2 = C2 + DX23 * (miny << shiftNumber) - DY23 * (minx << shiftNumber);
 	int CY3 = C3 + DX31 * (miny << shiftNumber) - DY31 * (minx << shiftNumber);
 	for(int y = miny; y < maxy; y++) {
 		int CX1 = CY1;
 		int CX2 = CY2;
 		int CX3 = CY3;
 		debugOut2[id*3+0] = (int2)(minx, maxx);
 		for(int x = minx; x < maxx; x++) {
 			debugOut2[id*3+0] = (int2)(CX1, CX2);
 			// Inside the triangle when all three half-edge functions are positive
 			if(CX1 > 0 && CX2 > 0 && CX3 > 0) {
 				debugOut2[id*3+0] = (int2)(1, 1);
 				write_imagef(
 					screen, 
 					(int2)(x,y), 
 					(float4)(1.0f,1.0f,1.0f,1.0f));
 			}
 			// Step one pixel in x
 			CX1 -= FDY12;
 			CX2 -= FDY23;
 			CX3 -= FDY31;
 		}
 		// Step one scanline in y
 		CY1 += FDX12;
 		CY2 += FDX23;
 		CY3 += FDX31;
 		//(char*&)colorBuffer += stride;
 		offset += stride;
 	}
 }
 //------------------------------------------------------------------------------
 // Expands the particle of this work-item into a screen-facing quad and
 // rasterizes it. The particle position is read from
 // vsOutputPrimitives[local id]; the four quad corners are written to
 // outputPrimitives[id*4 + 0..3] and then drawn as the two triangles
 // (0,1,2) and (2,1,3) with rasterizerUnOpt.
 // debugOut2 is not used in this function.
 void geometryShader(
    __constant float modelview[16],
 	__constant float projection[16],
 	__constant float inverseView[9],
 	__constant int4  viewport[1],
 	__local struct __VSSpriteOut  * vsOutputPrimitives,
 	__global struct __GSSpriteOut * outputPrimitives,
 //	 __global float4 * outputPrimitives,
 	__write_only image2d_t screen,
 	__read_only image2d_t particle,
 	__global float4 * debugOut1,
 	__global int * debugOut2)
 {
 	// Private copy of the corner texture coordinates (same values as g_texcoords)
 	float2 texcoords[4] = 
 	{ 
 		(float2)(0.0f,0.0f), 
 		(float2)(1.0f,0.0f),
 		(float2)(0.0f,1.0f),
 		(float2)(1.0f,1.0f)
 	};
 	float matrix[16];
 	uint id  = get_global_id(0);
 	uint lid = get_local_id(0);
 	float4 vsPosition = vsOutputPrimitives[lid].position;
 	// matrix = projection * modelview
 	matrixMul(matrix, projection, modelview);
 	//
 	// Emit two new triangles
 	//
 	for (uint i = 0; i<4; i++) {
 		// Corner offset scaled to the particle size, rotated by inverseView
 		// and moved to the particle position
 		float3 position = g_positions[i] * PARTICLE_RADIUS;
 		position        = matrixVector3Mul(inverseView, position) + vsPosition;
 		// NOTE(review): particlePosition is computed but never used below
 		float3 particlePosition = 
 			matrixVector3Mul( 
 				inverseView, 
 				(float4)(0.0f,0.0f,0.0f,0.0f)) + vsPosition;	// world space
 		// Compute view space position
 		position.w               = 1.0f;
 		position                 = matrixVectorMul(matrix, position);
 		//perspective division
 		position /= position.w;
 		struct __GSSpriteOut output;
 		output.position  = position;
 		//output.textureUV = g_texcoords[i];
 		output.textureUV = texcoords[i];
 		outputPrimitives[id*4+i] = output; 
 	}	
 	// Render QUAD - Triangle 1
 	rasterizerUnOpt(
 		outputPrimitives,
 		viewport,
 		screen,
 		particle,
 		0,
 		1,
 		2,
 		debugOut1);
 	// Render QUAD - Triangle 2
 	rasterizerUnOpt(
 		outputPrimitives,
 		viewport,
 		screen,
 		particle,
 		2,
 		1,
 		3,
 		debugOut1);
 }
 // Kernel entry point of the sprite pipeline, one work-item per particle.
 // Stores the particle's input position and its projection * modelview
 // transformed position in vsOutputPrimitives[local id], then calls
 // geometryShader to expand the particle into a quad and rasterize it.
 __kernel void vertexShaderSprite(
    __constant float modelview[16],
 	__constant float projection[16],
 	__constant float inverseView[9],
 	__constant int4  viewport[1],
 	__local struct __VSSpriteOut  * vsOutputPrimitives,
 	__global float4               * inputPrimitives, 	
 	__global struct __GSSpriteOut * outputPrimitives,
 //	__global float4 * outputPrimitives,
 	__write_only image2d_t screen,
 	__read_only image2d_t particle,
 	__global float4 * debugOut1,
 	__global int * debugOut2)
 {
 	float matrix[16];
 	uint id  = get_global_id(0);
 	uint lid = get_local_id(0);
 	// gl_ProjectionMatrix * gl_ModelViewMatrix * gl_Vertex
 	matrixMul(matrix, projection, modelview);
 	float4 position                          = inputPrimitives[id];
 	// The untransformed position is what geometryShader reads back
 	vsOutputPrimitives[lid].position         = position;
    vsOutputPrimitives[lid].particlePosition = 
 		matrixVectorMul(matrix, position); 
 	geometryShader(
 		modelview, 
 		projection, 
 		inverseView, 
 		viewport,
 		vsOutputPrimitives, 
 		outputPrimitives,
 		screen,
 		particle,
 		debugOut1,
 		debugOut2);
 }
--- a/Demos/OpenCLClothDemo/texture1.bmp
+++ b/Demos/OpenCLClothDemo/texture1.bmp
--- a/Demos/OpenCLClothDemo/vertex.glsl
+++ b/Demos/OpenCLClothDemo/vertex.glsl
@@ -0,0 +1,7 @@
 // Pass-through vertex shader: emits the incoming vertex position unchanged
 // (the modelview/projection transform is intentionally left disabled) and
 // forwards texture coordinate set 0 to the fragment stage.
 void main()
 {
 	//gl_Position = gl_ProjectionMatrix * gl_ModelViewMatrix * gl_Vertex;
 	gl_Position    = gl_Vertex;
 	gl_TexCoord[0] = gl_MultiTexCoord0;
 }
--- a/Demos/ParticlesOpenCL/AMD/CMakeLists.txt
+++ b/Demos/ParticlesOpenCL/AMD/CMakeLists.txt
@@ -7,6 +7,8 @@ ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL
 )
 ADD_DEFINITIONS(-DUSE_AMD_OPENCL)
 ADD_DEFINITIONS(-DCL_PLATFORM_AMD)
 IF (INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 	INCLUDE_DIRECTORIES(		$ENV{==ATISTREAMSDKROOT=}/include )
@@ -53,15 +55,17 @@ IF (USE_GLUT)
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btParticlesSharedDefs.h
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btParticlesSharedTypes.h
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesDemo.h
-					${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.h
+					
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/shaders.h	
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.h
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.h
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclUtils.cpp
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.cpp
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/main.cpp
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesDemo.cpp
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/shaders.cpp
-					${BULLET_PHYSICS_SOURCE_DIR}/Demos/SharedOpenCL/btOclCommon.cpp
+					
 					${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesOCL.cl
 	)
 ELSE (USE_GLUT)
--- a/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp
+++ b/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp
@@ -329,7 +329,9 @@ void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv)
 	if (!m_cxMainContext)
 	{
 //		m_cxMainContext = clCreateContextFromType(0, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErrNum);
-		m_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum);
+
 		m_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum);
 		//m_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum);
 		oclCHECKERROR(ciErrNum, CL_SUCCESS);
 		m_cdDevice = btOclGetMaxFlopsDev(m_cxMainContext);
--- a/Demos/SharedOpenCL/btOclCommon.cpp
+++ b/Demos/SharedOpenCL/btOclCommon.cpp
@@ -85,7 +85,7 @@ cl_context btOclCommon::createContextFromType(cl_device_type deviceType, cl_int*
 	/* Use NULL for backward compatibility */    
 	cl_context_properties* cprops = (NULL == platform) ? NULL : cps;
    cl_context retContext = clCreateContextFromType(cprops, 
-													CL_DEVICE_TYPE_ALL,                  
+													deviceType,                  
 													NULL,                  
 													NULL,                  
 													&ciErrNum);
--- a/Demos/SharedOpenCL/btOclUtils.cpp
+++ b/Demos/SharedOpenCL/btOclUtils.cpp
@@ -1,3 +1,18 @@
 /*
 Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
 Copyright (C) 2006 - 2010 Sony Computer Entertainment Inc. 
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
--- a/Demos/SharedOpenCL/btOclUtils.h
+++ b/Demos/SharedOpenCL/btOclUtils.h
@@ -1,3 +1,17 @@
 /*
 Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
 Copyright (C) 2006 - 2010 Sony Computer Entertainment Inc. 
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #ifndef BT_OCL_UTILS_H
 #define BT_OCL_UTILS_H
--- a/Demos/VectorAdd_OpenCL/VectorAddKernels.cl
+++ b/Demos/VectorAdd_OpenCL/VectorAddKernels.cl
@@ -1,13 +1,4 @@
 #ifndef GUID_ARG
 #define GUID_ARG
 #endif
 #ifndef MSTRINGIFY
 #define MSTRINGIFY(A) A
 #endif
 MSTRINGIFY(
--- a/src/BulletMultiThreaded/CMakeLists.txt
+++ b/src/BulletMultiThreaded/CMakeLists.txt
@@ -67,10 +67,8 @@ ADD_LIBRARY(BulletMultiThreaded
 )
-#for now, only Direct 11 (Direct Compute)
+
-IF(USE_DX11)
+SUBDIRS(GpuSoftBodySolvers)
 	SUBDIRS(GpuSoftBodySolvers)
 ENDIF(USE_DX11)
 IF (BUILD_SHARED_LIBS)
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/CMakeLists.txt
@@ -3,20 +3,12 @@ INCLUDE_DIRECTORIES(
 ${BULLET_PHYSICS_SOURCE_DIR}/src
 )
 LIST(APPEND SubDirList "CPU")
 SUBDIRS ( 
 	OpenCL
 	CPU 
 )
-# Configure use of OpenCL and DX11
+IF( USE_DX11 )
-# Generates the settings file and defines libraries and include paths
+	SUBDIRS( DX11 )
-OPTION(USE_OPENCL "Use OpenCL"	OFF)
+ENDIF( USE_DX11 )
 if( USE_OPENCL )
 	LIST(APPEND SubDirList "OpenCL")
 endif( USE_OPENCL )
 if( USE_DX11 )
 	LIST(APPEND SubDirList "DX11")
 endif( USE_DX11 )
 SUBDIRS( ${SubDirList} )
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/CMakeLists.txt
@@ -14,14 +14,17 @@ ${VECTOR_MATH_INCLUDE}
 SET(BulletSoftBodyDX11Solvers_SRCS
 	btSoftBodySolver_DX11.cpp
 	btSoftBodySolver_DX11SIMDAware.cpp
 )
 SET(BulletSoftBodyDX11Solvers_HDRS
 	btSoftBodySolver_DX11.h
 	btSoftBodySolver_DX11SIMDAware.h
 	../cpu/btSoftBodySolverData.h
 	btSoftBodySolverVertexData_DX11.h
 	btSoftBodySolverTriangleData_DX11.h
 	btSoftBodySolverLinkData_DX11.h
 	btSoftBodySolverLinkData_DX11SIMDAware.h
 	btSoftBodySolverBuffer_DX11.h
 	btSoftBodySolverVertexBuffer_DX11.h
@@ -37,6 +40,7 @@ SET(BulletSoftBodyDX11Solvers_Shaders
 	UpdatePositions
 	UpdateNodes
 	SolvePositions
 	SolvePositionsSIMDBatched
 	UpdatePositionsFromVelocities
 	ApplyForces
 	PrepareLinks
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositionsSIMDBatched.hlsl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositionsSIMDBatched.hlsl
@@ -0,0 +1,128 @@
 MSTRINGIFY(
 // Constants for one dispatch of SolvePositionsFromLinksKernel
 cbuffer SolvePositionsFromLinksKernelCB : register( b0 )
 {
 	// Index of the first logical wavefront processed by this dispatch
 	int startWaveInBatch;
 	// Number of logical wavefronts processed by this dispatch
 	int numWaves;
 	// Scale factor multiplied into every link correction
 	float kst;		
 	// Not referenced by SolvePositionsFromLinksKernel in this file
 	float ti;
 };
 // One element per logical wavefront: x is the number of link batches and
 // y is the number of vertices used by that wavefront
 StructuredBuffer<int2> g_wavefrontBatchCountsVertexCounts : register( t0 );
 // Global vertex addresses used by each wavefront, stored with a stride of
 // MAX_NUM_VERTICES_PER_WAVE entries per wavefront
 StructuredBuffer<int> g_vertexAddressesPerWavefront : register( t1 );
 // Inverse mass per vertex, indexed by global vertex address
 StructuredBuffer<float> g_verticesInverseMass : register( t2 );
 // Per-link data laid out in sub batches within wavefronts: every wavefront owns
 // MAX_BATCHES_PER_WAVE batches of WAVEFRONT_SIZE links, one link per lane.
 // The vertex indices are local to the wavefront, not global addresses.
 StructuredBuffer<int2> g_linksVertexIndices : register( t3 );
 StructuredBuffer<float> g_linksMassLSC : register( t4 );
 StructuredBuffer<float> g_linksRestLengthSquared : register( t5 );
 // Vertex positions, read and written in place by the kernel
 RWStructuredBuffer<float4> g_vertexPositions : register( u0 );
 // Data loaded on a per-wave basis, one slot or slice per logical wavefront in the thread group
 groupshared int2 wavefrontBatchCountsVertexCounts[WAVEFRONT_BLOCK_MULTIPLIER];
 groupshared float4 vertexPositionSharedData[MAX_NUM_VERTICES_PER_WAVE*WAVEFRONT_BLOCK_MULTIPLIER];
 groupshared float vertexInverseMassSharedData[MAX_NUM_VERTICES_PER_WAVE*WAVEFRONT_BLOCK_MULTIPLIER];
 // Storing the vertex addresses actually slowed things down a little
 //groupshared int vertexAddressSharedData[MAX_NUM_VERTICES_PER_WAVE*WAVEFRONT_BLOCK_MULTIPLIER];
 // Applies the link constraints of a range of logical wavefronts.
 // Each wavefront copies the vertices it uses into groupshared memory, applies its
 // link batches to those copies one batch after another, then writes the copies
 // back to g_vertexPositions.
 // NOTE(review): no barrier separates the groupshared writes from the reads that
 // follow them, so this presumably relies on the lanes of a logical wavefront
 // executing in lockstep - confirm WAVEFRONT_SIZE matches the target hardware.
 [numthreads(BLOCK_SIZE, 1, 1)]
 void 
 SolvePositionsFromLinksKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
 {
 	// Lane index inside the logical wavefront; the mask assumes WAVEFRONT_SIZE is a power of two - TODO confirm
 	const int laneInWavefront = (DTid.x & (WAVEFRONT_SIZE-1));
 	// Logical wavefront handled by this thread, offset by the start of the dispatch
 	const int wavefront = startWaveInBatch + (DTid.x / WAVEFRONT_SIZE);
 	// First wavefront of this thread group and this wavefront's index relative to it.
 	// NOTE(review): assumes BLOCK_SIZE equals WAVEFRONT_SIZE * WAVEFRONT_BLOCK_MULTIPLIER - confirm
 	const int firstWavefrontInBlock = startWaveInBatch + Gid.x * WAVEFRONT_BLOCK_MULTIPLIER;
 	const int localWavefront = wavefront - firstWavefrontInBlock;
 	// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier	
 	if( wavefront < (startWaveInBatch + numWaves) )
 	{
 		// Load the batch counts for the wavefronts
 		// Only lane 0 fetches the pair and publishes it to the other lanes through groupshared memory
 		if( laneInWavefront == 0 )
 		{
 			int2 batchesAndVertexCountsWithinWavefront = g_wavefrontBatchCountsVertexCounts[firstWavefrontInBlock + localWavefront];
 			wavefrontBatchCountsVertexCounts[localWavefront] = batchesAndVertexCountsWithinWavefront;
 		}
 		// Every lane reads back the pair written by lane 0: x is the batch count, y is the vertex count
 		int2 batchesAndVerticesWithinWavefront = wavefrontBatchCountsVertexCounts[localWavefront];
 		int batchesWithinWavefront = batchesAndVerticesWithinWavefront.x;
 		int verticesUsedByWave = batchesAndVerticesWithinWavefront.y;
 		// Load the vertices for the wavefronts
 		// Lanes stride through the vertex list WAVEFRONT_SIZE entries at a time
 		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
 		{
 			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
 			//vertexAddressSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = vertexAddress;
 			vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_vertexPositions[vertexAddress];
 			vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress];
 		}
 		// Loop through the batches performing the solve on each in LDS
 		// Link data for this wavefront starts at a fixed stride of MAX_BATCHES_PER_WAVE batches per wavefront
 		int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE;	
 		//for( int batch = 0; batch < batchesWithinWavefront; ++batch )
 		// The do/while body runs once even when batchesWithinWavefront is zero, unlike the for loop above.
 		// NOTE(review): presumably every wavefront holds at least one batch - confirm against the batching code
 		int batch = 0;
 		do
 		{
 			int baseDataLocation = baseDataLocationForWave + WAVEFRONT_SIZE * batch;
 			int locationOfValue = baseDataLocation + laneInWavefront;
 			// These loads should all be perfectly linear across the WF
 			int2 localVertexIndices = g_linksVertexIndices[locationOfValue];
 			float massLSC = g_linksMassLSC[locationOfValue];
 			float restLengthSquared = g_linksRestLengthSquared[locationOfValue];
 			// LDS vertex addresses based on logical wavefront number in block and loaded index
 			int vertexAddress0 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.x;
 			int vertexAddress1 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.y;
 			float3 position0 = vertexPositionSharedData[vertexAddress0].xyz;
 			float3 position1 = vertexPositionSharedData[vertexAddress1].xyz;
 			float inverseMass0 = vertexInverseMassSharedData[vertexAddress0];
 			float inverseMass1 = vertexInverseMassSharedData[vertexAddress1]; 
 			float3 del = position1 - position0;
 			// len holds the squared distance between the two vertices
 			float len = dot(del, del);
 			// k stays zero, so the link applies no correction, unless massLSC is positive
 			float k = 0;
 			if( massLSC > 0.0f )
 			{		
 				k = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst;
 			}
 			// Move the two ends in opposite directions along the link, each scaled by its own inverse mass
 			position0 = position0 - del*(k*inverseMass0);
 			position1 = position1 + del*(k*inverseMass1);
 			// The w component of the stored position is overwritten with zero here
 			vertexPositionSharedData[vertexAddress0] = float4(position0, 0.f);
 			vertexPositionSharedData[vertexAddress1] = float4(position1, 0.f);
 			++batch;
 		} while( batch < batchesWithinWavefront );
 		// Update the global memory vertices for the wavefronts
 		// Same strided traversal as the load loop, so each lane writes back the entries it loaded
 		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
 		{
 			int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
 			g_vertexPositions[vertexAddress] = vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex];
 		}
 	}
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11SIMDAware.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolverLinkData_DX11SIMDAware.h
@@ -0,0 +1,173 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
 #include "btSoftBodySolverBuffer_DX11.h"
 #ifndef BT_ACCELERATED_SOFT_BODY_LINK_DATA_DX11_SIMDAWARE_H
 #define BT_ACCELERATED_SOFT_BODY_LINK_DATA_DX11_SIMDAWARE_H
 struct ID3D11Device;
 struct ID3D11DeviceContext;
 /**
  * Link data for the SIMD-aware DX11 soft body solver.
  * Extends btSoftBodyLinkData with per-wavefront batching information and the
  * DX11 buffers that mirror the host-side arrays.
  * All read-only accessors are const so they can be used through const references.
  */
 class btSoftBodyLinkDataDX11SIMDAware : public btSoftBodyLinkData
 {
 public:
 	/** NOTE(review): presumably true while the link data lives in the DX11 buffers - confirm in the .cpp. */
 	bool				m_onGPU;
 	ID3D11Device		*m_d3dDevice;
 	ID3D11DeviceContext *m_d3dDeviceContext;

 	// Fixed at construction time; the values are chosen by the constructor defined in the .cpp.
 	const int m_wavefrontSize;
 	const int m_linksPerWorkItem;
 	const int m_maxLinksPerWavefront;
 	int m_maxBatchesWithinWave;
 	int m_maxVerticesWithinWave;
 	int m_numWavefronts;

 	int m_maxVertex;

 	/** Batch count and vertex count of a single wavefront. */
 	struct NumBatchesVerticesPair
 	{
 		int numBatches;
 		int numVertices;
 	};

 	// Array storing number of links in each wavefront
 	btAlignedObjectArray<int>									m_linksPerWavefront;
 	btAlignedObjectArray<NumBatchesVerticesPair>				m_numBatchesAndVerticesWithinWaves;
 	btDX11Buffer< NumBatchesVerticesPair >						m_dx11NumBatchesAndVerticesWithinWaves;

 	// All arrays here will contain batches of m_maxLinksPerWavefront links
 	// ordered by wavefront.
 	// with either global vertex pairs or local vertex pairs
 	btAlignedObjectArray< int >									m_wavefrontVerticesGlobalAddresses; // List of global vertices per wavefront
 	btDX11Buffer<int>											m_dx11WavefrontVerticesGlobalAddresses;
 	btAlignedObjectArray< LinkNodePair >						m_linkVerticesLocalAddresses; // Vertex pair for the link
 	btDX11Buffer<LinkNodePair>									m_dx11LinkVerticesLocalAddresses;
 	btDX11Buffer<float>											m_dx11LinkStrength;
 	btDX11Buffer<float>											m_dx11LinksMassLSC;
 	btDX11Buffer<float>											m_dx11LinksRestLengthSquared;
 	btDX11Buffer<float>											m_dx11LinksRestLength;
 	btDX11Buffer<float>											m_dx11LinksMaterialLinearStiffnessCoefficient;

 	/** Start index and length of one computation batch. */
 	struct BatchPair
 	{
 		int start;
 		int length;

 		BatchPair() :
 			start(0),
 			length(0)
 		{
 		}

 		BatchPair( int s, int l ) : 
 			start( s ),
 			length( l )
 		{
 		}
 	};

 	/**
 	 * Link addressing information for each cloth.
 	 * Allows link locations to be computed independently of data batching.
 	 */
 	btAlignedObjectArray< int >							m_linkAddresses;

 	/**
 	 * Start and length values for computation batches over link data.
 	 */
 	btAlignedObjectArray< BatchPair >		m_wavefrontBatchStartLengths;

 	//ID3D11Buffer*               readBackBuffer;

 	btSoftBodyLinkDataDX11SIMDAware( ID3D11Device *d3dDevice, ID3D11DeviceContext *d3dDeviceContext );

 	virtual ~btSoftBodyLinkDataDX11SIMDAware();

 	/** Allocate enough space in all link-related arrays to fit numLinks links */
 	virtual void createLinks( int numLinks );

 	/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
 	virtual void setLinkAt( const LinkDescription &link, int linkIndex );

 	virtual bool onAccelerator();

 	virtual bool moveToAccelerator();

 	virtual bool moveFromAccelerator();

 	/**
 	 * Generate (and later update) the batching for the entire link set.
 	 * This redoes a lot of work because it batches the entire set when each cloth is inserted.
 	 * In theory we could delay it until just before we need the cloth.
 	 * It's a one-off overhead, though, so that is a later optimisation.
 	 */
 	void generateBatches();

 	/** @return the current value of m_maxVerticesWithinWave. */
 	int getMaxVerticesPerWavefront() const
 	{
 		return m_maxVerticesWithinWave;
 	}

 	/** @return the wavefront size fixed at construction. */
 	int getWavefrontSize() const
 	{
 		return m_wavefrontSize;
 	}

 	/** @return the links-per-work-item value fixed at construction. */
 	int getLinksPerWorkItem() const
 	{
 		return m_linksPerWorkItem;
 	}

 	/** @return the maximum links per wavefront fixed at construction. */
 	int getMaxLinksPerWavefront() const
 	{
 		return m_maxLinksPerWavefront;
 	}

 	/** @return the current value of m_maxBatchesWithinWave. */
 	int getMaxBatchesPerWavefront() const
 	{
 		return m_maxBatchesWithinWave;
 	}

 	/** @return the current value of m_numWavefronts. */
 	int getNumWavefronts() const
 	{
 		return m_numWavefronts;
 	}

 	/**
 	 * @param wavefront index into m_numBatchesAndVerticesWithinWaves; not range-checked here.
 	 * @return a copy of the batch and vertex counts stored for that wavefront.
 	 */
 	NumBatchesVerticesPair getNumBatchesAndVerticesWithinWavefront( int wavefront ) const
 	{
 		return m_numBatchesAndVerticesWithinWaves[wavefront];
 	}

 	/**
 	 * @param vertexIndex index into m_wavefrontVerticesGlobalAddresses; not range-checked here.
 	 * @return the global vertex address stored at that index.
 	 */
 	int getVertexGlobalAddresses( int vertexIndex ) const
 	{
 		return m_wavefrontVerticesGlobalAddresses[vertexIndex];
 	}

 	/**
 	 * Get post-batching local addresses of the vertex pair for a link assuming all vertices used by a wavefront are loaded locally.
 	 */
 	LinkNodePair getVertexPairLocalAddresses( int linkIndex ) const
 	{
 		return m_linkVerticesLocalAddresses[linkIndex];
 	}
 };
 #endif // #ifndef BT_ACCELERATED_SOFT_BODY_LINK_DATA_DX11_SIMDAWARE_H
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.cpp
@@ -622,7 +622,7 @@ void btDX11SoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &softB
 			using Vectormath::Aos::Point3;
 			// Create SoftBody that will store the information within the solver
-			btAcceleratedSoftBodyInterface *newSoftBody = new btAcceleratedSoftBodyInterface( softBody );
+			btDX11AcceleratedSoftBodyInterface *newSoftBody = new btDX11AcceleratedSoftBodyInterface( softBody );
 			m_softBodySet.push_back( newSoftBody );
 			m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );
@@ -1451,11 +1451,11 @@ void btDX11SoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( float
-btDX11SoftBodySolver::btAcceleratedSoftBodyInterface *btDX11SoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
+btDX11AcceleratedSoftBodyInterface *btDX11SoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
 {
 	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
 	{
-		btAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
+		btDX11AcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
 		if( softBodyInterface->getSoftBody() == softBody )
 			return softBodyInterface;
 	}
@@ -1466,7 +1466,7 @@ void btDX11SoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * const
 {
 	checkInitialized();
-	btAcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );
+	btDX11AcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );
 	const int firstVertex = currentCloth->getFirstVertex();
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11.h
@@ -13,6 +13,9 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 #ifndef BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
 #define BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
 #include "vectormath/vmInclude.h"
 #include "BulletSoftBody/btSoftBodySolvers.h"
@@ -22,185 +25,184 @@ subject to the following restrictions:
 #include "btSoftBodySolverTriangleData_DX11.h"
-#ifndef BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
+
-#define BT_ACCELERATED_SOFT_BODY_DX11_SOLVER_H
+/**
 * SoftBody class to maintain information about a soft body instance
 * within a solver.
 * This data addresses the main solver arrays.
 */
 class btDX11AcceleratedSoftBodyInterface
 {
 protected:
 	/** Current number of vertices that are part of this cloth */
 	int m_numVertices;
 	/** Maximum number of vertices allocated to be part of this cloth */
 	int m_maxVertices;
 	/** Current number of triangles that are part of this cloth */
 	int m_numTriangles;
 	/** Maximum number of triangles allocated to be part of this cloth */
 	int m_maxTriangles;
 	/** Index of first vertex in the world allocated to this cloth */
 	int m_firstVertex;
 	/** Index of first triangle in the world allocated to this cloth */
 	int m_firstTriangle;
 	/** Index of first link in the world allocated to this cloth */
 	int m_firstLink;
 	/** Maximum number of links allocated to this cloth */
 	int m_maxLinks;
 	/** Current number of links allocated to this cloth */
 	int m_numLinks;

 	/**
 	 * The actual soft body this data represents.
 	 * Written as an elaborated type specifier so that this class only stores a
 	 * pointer and does not need btSoftBody to be declared beforehand.
 	 */
 	class btSoftBody *m_softBody;


 public:
 	/**
 	 * Bind the interface to softBody. Every count and offset starts at zero.
 	 * The pointer is stored as given; it is not dereferenced by this class.
 	 */
 	btDX11AcceleratedSoftBodyInterface( btSoftBody *softBody ) :
 		m_numVertices( 0 ),
 		m_maxVertices( 0 ),
 		m_numTriangles( 0 ),
 		m_maxTriangles( 0 ),
 		m_firstVertex( 0 ),
 		m_firstTriangle( 0 ),
 		m_firstLink( 0 ),
 		m_maxLinks( 0 ),
 		m_numLinks( 0 ),
 		m_softBody( softBody )
 	{
 	}

 	// Read-only accessors. They are const so they can be called through a const reference.
 	int getNumVertices() const		{ return m_numVertices; }
 	int getNumTriangles() const		{ return m_numTriangles; }
 	int getMaxVertices() const		{ return m_maxVertices; }
 	int getMaxTriangles() const		{ return m_maxTriangles; }
 	int getFirstVertex() const		{ return m_firstVertex; }
 	int getFirstTriangle() const	{ return m_firstTriangle; }

 	// TODO: All of these set functions will have to do checks and
 	// update the world because restructuring of the arrays will be necessary
 	// Reasonable use of "friend"?
 	void setNumVertices( int numVertices )		{ m_numVertices = numVertices; }
 	void setNumTriangles( int numTriangles )	{ m_numTriangles = numTriangles; }
 	void setMaxVertices( int maxVertices )		{ m_maxVertices = maxVertices; }
 	void setMaxTriangles( int maxTriangles )	{ m_maxTriangles = maxTriangles; }
 	void setFirstVertex( int firstVertex )		{ m_firstVertex = firstVertex; }
 	void setFirstTriangle( int firstTriangle )	{ m_firstTriangle = firstTriangle; }
 	void setMaxLinks( int maxLinks )			{ m_maxLinks = maxLinks; }
 	void setNumLinks( int numLinks )			{ m_numLinks = numLinks; }
 	void setFirstLink( int firstLink )			{ m_firstLink = firstLink; }

 	int getMaxLinks() const		{ return m_maxLinks; }
 	int getNumLinks() const		{ return m_numLinks; }
 	int getFirstLink() const	{ return m_firstLink; }

 	/** @return the soft body pointer passed to the constructor. */
 	btSoftBody* getSoftBody() const
 	{
 		return m_softBody;
 	}

 	// Disabled code: it refers to m_currentSolver and m_clothIdentifier, which this class does not declare.
 #if 0
 	void setAcceleration( Vectormath::Aos::Vector3 acceleration )
 	{
 		m_currentSolver->setPerClothAcceleration( m_clothIdentifier, acceleration );
 	}
 	void setWindVelocity( Vectormath::Aos::Vector3 windVelocity )
 	{
 		m_currentSolver->setPerClothWindVelocity( m_clothIdentifier, windVelocity );
 	}
 	/** 
 	 * Set the density of the air in which the cloth is situated.
 	 */
 	void setAirDensity( btScalar density )
 	{
 		m_currentSolver->setPerClothMediumDensity( m_clothIdentifier, static_cast<float>(density) );
 	}
 	/**
 	 * Add a collision object to this soft body.
 	 */
 	void addCollisionObject( btCollisionObject *collisionObject )
 	{
 		m_currentSolver->addCollisionObjectForSoftBody( m_clothIdentifier, collisionObject );
 	}
 #endif
 };
 class btDX11SoftBodySolver : public btSoftBodySolver
 {
 public:
 	/**
 	 * SoftBody class to maintain information about a soft body instance
 	 * within a solver.
 	 * This data addresses the main solver arrays.
 	 */
 	class btAcceleratedSoftBodyInterface
 	{
 	protected:
 		/** Current number of vertices that are part of this cloth */
 		int m_numVertices;
 		/** Maximum number of vertices allocated to be part of this cloth */
 		int m_maxVertices;
 		/** Current number of triangles that are part of this cloth */
 		int m_numTriangles;
 		/** Maximum number of triangles allocated to be part of this cloth */
 		int m_maxTriangles;
 		/** Index of first vertex in the world allocated to this cloth */
 		int m_firstVertex;
 		/** Index of first triangle in the world allocated to this cloth */
 		int m_firstTriangle;
 		/** Index of first link in the world allocated to this cloth */
 		int m_firstLink;
 		/** Maximum number of links allocated to this cloth */
 		int m_maxLinks;
 		/** Current number of links allocated to this cloth */
 		int m_numLinks;

 		/**
 		 * The actual soft body this data represents.
 		 * Written as an elaborated type specifier so that this class only stores a
 		 * pointer and does not need btSoftBody to be declared beforehand.
 		 */
 		class btSoftBody *m_softBody;


 	public:
 		/**
 		 * Bind the interface to softBody. Every count and offset starts at zero.
 		 * The pointer is stored as given; it is not dereferenced by this class.
 		 */
 		btAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
 			m_numVertices( 0 ),
 			m_maxVertices( 0 ),
 			m_numTriangles( 0 ),
 			m_maxTriangles( 0 ),
 			m_firstVertex( 0 ),
 			m_firstTriangle( 0 ),
 			m_firstLink( 0 ),
 			m_maxLinks( 0 ),
 			m_numLinks( 0 ),
 			m_softBody( softBody )
 		{
 		}

 		// Read-only accessors. They are const so they can be called through a const reference.
 		int getNumVertices() const		{ return m_numVertices; }
 		int getNumTriangles() const		{ return m_numTriangles; }
 		int getMaxVertices() const		{ return m_maxVertices; }
 		int getMaxTriangles() const		{ return m_maxTriangles; }
 		int getFirstVertex() const		{ return m_firstVertex; }
 		int getFirstTriangle() const	{ return m_firstTriangle; }

 		// TODO: All of these set functions will have to do checks and
 		// update the world because restructuring of the arrays will be necessary
 		// Reasonable use of "friend"?
 		void setNumVertices( int numVertices )		{ m_numVertices = numVertices; }
 		void setNumTriangles( int numTriangles )	{ m_numTriangles = numTriangles; }
 		void setMaxVertices( int maxVertices )		{ m_maxVertices = maxVertices; }
 		void setMaxTriangles( int maxTriangles )	{ m_maxTriangles = maxTriangles; }
 		void setFirstVertex( int firstVertex )		{ m_firstVertex = firstVertex; }
 		void setFirstTriangle( int firstTriangle )	{ m_firstTriangle = firstTriangle; }
 		void setMaxLinks( int maxLinks )			{ m_maxLinks = maxLinks; }
 		void setNumLinks( int numLinks )			{ m_numLinks = numLinks; }
 		void setFirstLink( int firstLink )			{ m_firstLink = firstLink; }

 		int getMaxLinks() const		{ return m_maxLinks; }
 		int getNumLinks() const		{ return m_numLinks; }
 		int getFirstLink() const	{ return m_firstLink; }

 		/** @return the soft body pointer passed to the constructor. */
 		btSoftBody* getSoftBody() const
 		{
 			return m_softBody;
 		}

 		// Disabled code: it refers to m_currentSolver and m_clothIdentifier, which this class does not declare.
 	#if 0
 		void setAcceleration( Vectormath::Aos::Vector3 acceleration )
 		{
 			m_currentSolver->setPerClothAcceleration( m_clothIdentifier, acceleration );
 		}
 		void setWindVelocity( Vectormath::Aos::Vector3 windVelocity )
 		{
 			m_currentSolver->setPerClothWindVelocity( m_clothIdentifier, windVelocity );
 		}
 		/** 
 		 * Set the density of the air in which the cloth is situated.
 		 */
 		void setAirDensity( btScalar density )
 		{
 			m_currentSolver->setPerClothMediumDensity( m_clothIdentifier, static_cast<float>(density) );
 		}
 		/**
 		 * Add a collision object to this soft body.
 		 */
 		void addCollisionObject( btCollisionObject *collisionObject )
 		{
 			m_currentSolver->addCollisionObjectForSoftBody( m_clothIdentifier, collisionObject );
 		}
 	#endif
 	};
 	class KernelDesc
 	{
@@ -344,7 +346,7 @@ private:
 	 * Cloths owned by this solver.
 	 * Only our cloths are in this array.
 	 */
-	btAlignedObjectArray< btAcceleratedSoftBodyInterface * > m_softBodySet;
+	btAlignedObjectArray< btDX11AcceleratedSoftBodyInterface * > m_softBodySet;
 	/** Acceleration value to be applied to all non-static vertices in the solver. 
 	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
@@ -429,7 +431,7 @@ private:
 	void updateConstants( float timeStep );
-	btAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
+	btDX11AcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
 	//////////////////////////////////////
 	// Kernel dispatches
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.h
@@ -0,0 +1,432 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #include "vectormath/vmInclude.h"
 #include "BulletSoftBody/btSoftBodySolvers.h"
 #include "btSoftBodySolverVertexBuffer_DX11.h"
 #include "btSoftBodySolverLinkData_DX11SIMDAware.h"
 #include "btSoftBodySolverVertexData_DX11.h"
 #include "btSoftBodySolverTriangleData_DX11.h"
 #ifndef BT_SOFT_BODY_DX11_SOLVER_SIMDAWARE_H
 #define BT_SOFT_BODY_DX11_SOLVER_SIMDAWARE_H
 /**
  * Soft body solver declared on top of Direct3D 11 compute shaders, using the
  * wavefront-batched link data of btSoftBodyLinkDataDX11SIMDAware.
  * Only the declaration lives here; the member functions are defined in the .cpp.
  */
 class btDX11SIMDAwareSoftBodySolver : public btSoftBodySolver
 {
 public:
 	/**
 	 * SoftBody class to maintain information about a soft body instance
 	 * within a solver.
 	 * This data addresses the main solver arrays.
 	 */
 	class btAcceleratedSoftBodyInterface
 	{
 	protected:
 		/** Current number of vertices that are part of this cloth */
 		int m_numVertices;
 		/** Maximum number of vertices allocated to be part of this cloth */
 		int m_maxVertices;
 		/** Current number of triangles that are part of this cloth */
 		int m_numTriangles;
 		/** Maximum number of triangles allocated to be part of this cloth */
 		int m_maxTriangles;
 		/** Index of first vertex in the world allocated to this cloth */
 		int m_firstVertex;
 		/** Index of first triangle in the world allocated to this cloth */
 		int m_firstTriangle;
 		/** Index of first link in the world allocated to this cloth */
 		int m_firstLink;
 		/** Maximum number of links allocated to this cloth */
 		int m_maxLinks;
 		/** Current number of links allocated to this cloth */
 		int m_numLinks;

 		/** The actual soft body this data represents */
 		btSoftBody *m_softBody;


 	public:
 		/**
 		 * Bind the interface to softBody. Every count and offset starts at zero.
 		 * The pointer is stored as given; it is not dereferenced by this class.
 		 */
 		btAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
 			m_numVertices( 0 ),
 			m_maxVertices( 0 ),
 			m_numTriangles( 0 ),
 			m_maxTriangles( 0 ),
 			m_firstVertex( 0 ),
 			m_firstTriangle( 0 ),
 			m_firstLink( 0 ),
 			m_maxLinks( 0 ),
 			m_numLinks( 0 ),
 			m_softBody( softBody )
 		{
 		}

 		// Read-only accessors. They are const so they can be called through a const reference.
 		int getNumVertices() const		{ return m_numVertices; }
 		int getNumTriangles() const		{ return m_numTriangles; }
 		int getMaxVertices() const		{ return m_maxVertices; }
 		int getMaxTriangles() const		{ return m_maxTriangles; }
 		int getFirstVertex() const		{ return m_firstVertex; }
 		int getFirstTriangle() const	{ return m_firstTriangle; }

 		// Plain setters: each one stores its argument without any validation.
 		void setNumVertices( int numVertices )		{ m_numVertices = numVertices; }
 		void setNumTriangles( int numTriangles )	{ m_numTriangles = numTriangles; }
 		void setMaxVertices( int maxVertices )		{ m_maxVertices = maxVertices; }
 		void setMaxTriangles( int maxTriangles )	{ m_maxTriangles = maxTriangles; }
 		void setFirstVertex( int firstVertex )		{ m_firstVertex = firstVertex; }
 		void setFirstTriangle( int firstTriangle )	{ m_firstTriangle = firstTriangle; }
 		void setMaxLinks( int maxLinks )			{ m_maxLinks = maxLinks; }
 		void setNumLinks( int numLinks )			{ m_numLinks = numLinks; }
 		void setFirstLink( int firstLink )			{ m_firstLink = firstLink; }

 		int getMaxLinks() const		{ return m_maxLinks; }
 		int getNumLinks() const		{ return m_numLinks; }
 		int getFirstLink() const	{ return m_firstLink; }

 		/** @return the soft body pointer passed to the constructor. */
 		btSoftBody* getSoftBody() const
 		{
 			return m_softBody;
 		}
 	};

 	/**
 	 * Pairs a compute shader with its constant buffer.
 	 * Both pointers start out null and are not released by the destructor.
 	 */
 	class KernelDesc
 	{
 	public:
 		ID3D11ComputeShader* kernel;
 		ID3D11Buffer* constBuffer;

 		KernelDesc()
 		{
 			kernel = 0;
 			constBuffer = 0;
 		}

 		virtual ~KernelDesc()
 		{
 			// TODO: this should probably destroy its kernel but we need to be careful
 			// in case KernelDescs are copied
 		}
 	}; 

 	// Constant buffer layouts. Each struct below is a multiple of 16 bytes in size;
 	// the paddingN members exist only to reach that size.
 	// NOTE(review): presumably each must match the cbuffer of the corresponding HLSL kernel - confirm.

 	struct SolvePositionsFromLinksKernelCB
 	{		
 		int startWave;
 		int numWaves;
 		float kst;
 		float ti;
 	};

 	struct IntegrateCB
 	{
 		int numNodes;
 		float solverdt;
 		int padding1;
 		int padding2;
 	};

 	struct UpdatePositionsFromVelocitiesCB
 	{
 		int numNodes;
 		float solverSDT;
 		int padding1;
 		int padding2;
 	};

 	struct UpdateVelocitiesFromPositionsWithoutVelocitiesCB
 	{
 		int numNodes;
 		float isolverdt;
 		int padding1;
 		int padding2;
 	};

 	struct UpdateVelocitiesFromPositionsWithVelocitiesCB
 	{
 		int numNodes;
 		float isolverdt;
 		int padding1;
 		int padding2;
 	};

 	struct UpdateSoftBodiesCB
 	{
 		int numNodes;
 		int startFace;
 		int numFaces;
 		float epsilon;
 	};

 	struct OutputToVertexArrayCB
 	{
 		int startNode;
 		int numNodes;
 		int positionOffset;
 		int positionStride;

 		int normalOffset;	
 		int normalStride;
 		int padding1;
 		int padding2;
 	};

 	struct ApplyForcesCB
 	{
 		unsigned int numNodes;
 		float solverdt;
 		float epsilon;
 		int padding3;
 	};

 	struct AddVelocityCB
 	{
 		int startNode;
 		int lastNode;
 		float velocityX;
 		float velocityY;
 		float velocityZ;
 		int padding1;
 		int padding2;
 		int padding3;
 	};


 private:
 	ID3D11Device *		 m_dx11Device;
 	ID3D11DeviceContext* m_dx11Context;

 	/** Link data for all cloths. Note that this will be sorted batch-wise for efficient computation and m_linkAddresses will maintain the addressing. */
 	btSoftBodyLinkDataDX11SIMDAware m_linkData;
 	btSoftBodyVertexDataDX11 m_vertexData;
 	btSoftBodyTriangleDataDX11 m_triangleData;

 	/** Variable to define whether we need to update solver constants on the next iteration */
 	bool m_updateSolverConstants;

 	bool m_shadersInitialized;

 	/** 
 	 * Cloths owned by this solver.
 	 * Only our cloths are in this array.
 	 */
 	btAlignedObjectArray< btAcceleratedSoftBodyInterface * > m_softBodySet;

 	/** Acceleration value to be applied to all non-static vertices in the solver. 
 	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
 	 */
 	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothAcceleration;
 	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11PerClothAcceleration;

 	/** Wind velocity to be applied normal to all non-static vertices in the solver. 
 	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
 	 */
 	btAlignedObjectArray< Vectormath::Aos::Vector3 >	m_perClothWindVelocity;
 	btDX11Buffer<Vectormath::Aos::Vector3>				m_dx11PerClothWindVelocity;

 	/** Velocity damping factor */
 	btAlignedObjectArray< float >						m_perClothDampingFactor;
 	btDX11Buffer<float>									m_dx11PerClothDampingFactor;

 	/** Velocity correction coefficient */
 	btAlignedObjectArray< float >						m_perClothVelocityCorrectionCoefficient;
 	btDX11Buffer<float>									m_dx11PerClothVelocityCorrectionCoefficient;

 	/** Lift parameter for wind effect on cloth. */
 	btAlignedObjectArray< float >						m_perClothLiftFactor;
 	btDX11Buffer<float>									m_dx11PerClothLiftFactor;

 	/** Drag parameter for wind effect on cloth. */
 	btAlignedObjectArray< float >						m_perClothDragFactor;
 	btDX11Buffer<float>									m_dx11PerClothDragFactor;

 	/** Density of the medium in which each cloth sits */
 	btAlignedObjectArray< float >						m_perClothMediumDensity;
 	btDX11Buffer<float>									m_dx11PerClothMediumDensity;

 	// One KernelDesc per compute shader used by the solver
 	KernelDesc		solvePositionsFromLinksKernel;
 	KernelDesc		integrateKernel;
 	KernelDesc		addVelocityKernel;
 	KernelDesc		updatePositionsFromVelocitiesKernel;
 	KernelDesc		updateVelocitiesFromPositionsWithoutVelocitiesKernel;
 	KernelDesc		updateVelocitiesFromPositionsWithVelocitiesKernel;
 	KernelDesc		resetNormalsAndAreasKernel;
 	KernelDesc		normalizeNormalsAndAreasKernel;
 	KernelDesc		updateSoftBodiesKernel;
 	KernelDesc		outputToVertexArrayWithNormalsKernel;
 	KernelDesc		outputToVertexArrayWithoutNormalsKernel;

 	KernelDesc		outputToVertexArrayKernel;
 	KernelDesc		applyForcesKernel;
 	KernelDesc		collideSphereKernel;
 	KernelDesc		collideCylinderKernel;


 	/**
 	 * Integrate motion on the solver.
 	 */
 	virtual void integrate( float solverdt );
 	float computeTriangleArea( 
 		const Vectormath::Aos::Point3 &vertex0,
 		const Vectormath::Aos::Point3 &vertex1,
 		const Vectormath::Aos::Point3 &vertex2 );

 	/**
 	 * Compile a compute shader kernel from a string and return the appropriate KernelDesc object.
 	 */
 	KernelDesc compileComputeShaderFromString( const char* shaderString, const char* shaderName, int constBufferSize, D3D10_SHADER_MACRO *compileMacros = 0 );

 	bool buildShaders();

 	void resetNormalsAndAreas( int numVertices );

 	void normalizeNormalsAndAreas( int numVertices );

 	void executeUpdateSoftBodies( int firstTriangle, int numTriangles );

 	Vectormath::Aos::Vector3 ProjectOnAxis( const Vectormath::Aos::Vector3 &v, const Vectormath::Aos::Vector3 &a );

 	void ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce );

 	virtual void applyForces( float solverdt );

 	void updateConstants( float timeStep );

 	/** Look up the interface object stored in m_softBodySet for the given soft body. */
 	btAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );

 	//////////////////////////////////////
 	// Kernel dispatches
 	void prepareLinks();

 	void updatePositionsFromVelocities( float solverdt );
 	void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
 	void solveLinksForVelocity( int startLink, int numLinks, float kst );

 	void updateVelocitiesFromPositionsWithVelocities( float isolverdt );
 	void updateVelocitiesFromPositionsWithoutVelocities( float isolverdt );

 	// End kernel dispatches
 	/////////////////////////////////////

 	void releaseKernels();

 public:
 	btDX11SIMDAwareSoftBodySolver(ID3D11Device * dx11Device, ID3D11DeviceContext* dx11Context);

 	virtual ~btDX11SIMDAwareSoftBodySolver();

 	virtual btSoftBodyLinkData &getLinkData();

 	virtual btSoftBodyVertexData &getVertexData();

 	virtual btSoftBodyTriangleData &getTriangleData();

 	virtual bool checkInitialized();

 	virtual void updateSoftBodies( );

 	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies );

 	virtual void solveConstraints( float solverdt );

 	virtual void predictMotion( float solverdt );

 	virtual void copySoftBodyToVertexBuffer( const btSoftBody *const softBody, btVertexBufferDescriptor *vertexBuffer );
 };
 #endif // #ifndef BT_SOFT_BODY_DX11_SOLVER_SIMDAWARE_H
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
@@ -0,0 +1,82 @@
 # Builds the soft body OpenCL solver library for the AMD OpenCL platform.
 # Bullet headers are resolved relative to the Bullet source tree.
 INCLUDE_DIRECTORIES(
 ${BULLET_PHYSICS_SOURCE_DIR}/src
 )
 # Preprocessor switches applied to the sources compiled in this directory.
 ADD_DEFINITIONS(-DUSE_AMD_OPENCL)
 ADD_DEFINITIONS(-DCL_PLATFORM_AMD)
 # The SDK include directory is taken from the ATISTREAMSDKROOT environment variable.
 # The distributable MSVC project mode reads the ==ATISTREAMSDKROOT= name instead.
 IF (INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 	INCLUDE_DIRECTORIES(		$ENV{==ATISTREAMSDKROOT=}/include )
 ELSE()
 	INCLUDE_DIRECTORIES(		$ENV{ATISTREAMSDKROOT}/include	)
 ENDIF()
 # Solver implementation; the file lives one directory up.
 SET(BulletSoftBodyOpenCLSolvers_SRCS
 	../btSoftBodySolver_OpenCL.cpp
 )
 # Headers belonging to the library (also used as PUBLIC_HEADER for framework builds below).
 SET(BulletSoftBodyOpenCLSolvers_HDRS
 	../btSoftBodySolver_OpenCL.h
 	../../CPU/btSoftBodySolverData.h
 	../btSoftBodySolverVertexData_OpenCL.h
 	../btSoftBodySolverTriangleData_OpenCL.h
 	../btSoftBodySolverLinkData_OpenCL.h
 	../btSoftBodySolverBuffer_OpenCL.h
 )
 # OpenCL kernel sources.
 # Each name below is expanded to ../OpenCLC/<name>.cl and added to the
 # source list of the library target.
 SET(BulletSoftBodyOpenCLSolvers_Shaders
 #	OutputToVertexArray
 	UpdateNormals
 	Integrate
 	UpdatePositions
 	UpdateNodes
 	SolvePositions
 	UpdatePositionsFromVelocities
 	ApplyForces
 	PrepareLinks
 	VSolveLinks
 )
 foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC/${f}.cl")
 endforeach(f) 
 ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_AMD
 	${BulletSoftBodyOpenCLSolvers_SRCS} 
 	${BulletSoftBodyOpenCLSolvers_HDRS} 
 	${BulletSoftBodyOpenCLSolvers_OpenCLC}
 )
 SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES VERSION ${BULLET_VERSION})
 SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES SOVERSION ${BULLET_VERSION})
 # Attach the link dependencies to the library defined above. The target named
 # here must be the one created in this directory; BulletSoftBody is not.
 IF (BUILD_SHARED_LIBS)
 	TARGET_LINK_LIBRARIES(BulletSoftBodySolvers_OpenCL_AMD BulletSoftBody BulletDynamics)
 ENDIF (BUILD_SHARED_LIBS)
 # Installation rules; skipped entirely when generating distributable MSVC project files.
 IF (INSTALL_LIBS)
 	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
 			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_AMD DESTINATION .)
 			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_AMD DESTINATION lib${LIB_SUFFIX})
 				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
 			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
 		# Framework builds mark the target as a framework and publish the header list.
 		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES FRAMEWORK true)
 			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_AMD PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
 		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 ENDIF (INSTALL_LIBS)
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Apple/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/Apple/CMakeLists.txt
@@ -0,0 +1,73 @@
 # Builds the soft body OpenCL solver library for the Apple OpenCL platform.
 # Bullet headers are resolved relative to the Bullet source tree.
 INCLUDE_DIRECTORIES(
 ${BULLET_PHYSICS_SOURCE_DIR}/src
 )
 # Solver implementation; the file lives one directory up.
 SET(BulletSoftBodyOpenCLSolvers_SRCS
 	../btSoftBodySolver_OpenCL.cpp
 )
 # Headers belonging to the library (also used as PUBLIC_HEADER for framework builds below).
 SET(BulletSoftBodyOpenCLSolvers_HDRS
 	../btSoftBodySolver_OpenCL.h
 	../../CPU/btSoftBodySolverData.h
 	../btSoftBodySolverVertexData_OpenCL.h
 	../btSoftBodySolverTriangleData_OpenCL.h
 	../btSoftBodySolverLinkData_OpenCL.h
 	../btSoftBodySolverBuffer_OpenCL.h
 )
 # OpenCL kernel sources.
 # Each name below is expanded to ../OpenCLC10/<name>.cl and added to the
 # source list of the library target.
 SET(BulletSoftBodyOpenCLSolvers_Shaders
 #	OutputToVertexArray
 	UpdateNormals
 	Integrate
 	UpdatePositions
 	UpdateNodes
 	SolvePositions
 	UpdatePositionsFromVelocities
 	ApplyForces
 	PrepareLinks
 	VSolveLinks
 )
 foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${f}.cl")
 endforeach(f) 
 ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_Apple
 	${BulletSoftBodyOpenCLSolvers_SRCS} 
 	${BulletSoftBodyOpenCLSolvers_HDRS} 
 	${BulletSoftBodyOpenCLSolvers_OpenCLC}
 )
 SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES VERSION ${BULLET_VERSION})
 SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES SOVERSION ${BULLET_VERSION})
 # Attach the link dependencies to the library defined above. The target named
 # here must be the one created in this directory; BulletSoftBody is not.
 IF (BUILD_SHARED_LIBS)
 	TARGET_LINK_LIBRARIES(BulletSoftBodySolvers_OpenCL_Apple BulletSoftBody BulletDynamics)
 ENDIF (BUILD_SHARED_LIBS)
 # Installation rules; skipped entirely when generating distributable MSVC project files.
 IF (INSTALL_LIBS)
 	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
 			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Apple  DESTINATION .)
 			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Apple  DESTINATION lib${LIB_SUFFIX})
 				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
 			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
 		# Framework builds mark the target as a framework and publish the header list.
 		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES FRAMEWORK true)
 			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Apple PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
 		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 ENDIF (INSTALL_LIBS)
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/CMakeLists.txt
@@ -1,71 +1,16 @@
-INCLUDE_DIRECTORIES(
+IF(BUILD_MINICL_OPENCL_DEMOS)
-${BULLET_PHYSICS_SOURCE_DIR}/src
+	SUBDIRS( MiniCL  )
-)
+ENDIF()
 IF(BUILD_AMD_OPENCL_DEMOS)
 	SUBDIRS(AMD)
 ENDIF()
-SET(OPENCL_DIR $ENV{ATISTREAMSDKROOT})
+IF(BUILD_NVIDIA_OPENCL_DEMOS)
-SET(OPENCL_INCLUDE_PATH "${ATISTREAMSDKROOT}/include" CACHE DOCSTRING "OpenCL SDK include path")
+	SUBDIRS(NVidia)
 ENDIF()
-INCLUDE_DIRECTORIES(${OPENCL_INCLUDE_PATH} "../cpu/")
+IF(APPLE)
-
+	SUBDIRS(Apple)
-SET(BulletSoftBodyOpenCLSolvers_SRCS
+ENDIF()
 	btSoftBodySolver_OpenCL.cpp
 )
 SET(BulletSoftBodyOpenCLSolvers_HDRS
 	btSoftBodySolver_OpenCL.h
 	../cpu/btSoftBodySolverData.h
 	btSoftBodySolverVertexData_OpenCL.h
 	btSoftBodySolverTriangleData_OpenCL.h
 	btSoftBodySolverLinkData_OpenCL.h
 	btSoftBodySolverBuffer_OpenCL.h
 )
 # OpenCL and HLSL Shaders.
 # Build rules generated to stringify these into headers
 # which are needed by some of the sources
 SET(BulletSoftBodyOpenCLSolvers_Shaders
 #	OutputToVertexArray
 	UpdateNormals
 	Integrate
 	UpdatePositions
 	UpdateNodes
 	SolvePositions
 	UpdatePositionsFromVelocities
 	ApplyForces
 	PrepareLinks
 	VSolveLinks
 )
 foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "OpenCLC/${f}.cl")
 endforeach(f) 
 ADD_LIBRARY(BulletSoftBodySolvers_OpenCL  ${BulletSoftBodyOpenCLSolvers_SRCS} ${BulletSoftBodyOpenCLSolvers_HDRS} ${BulletSoftBodyOpenCLSolvers_OpenCLC})
 SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL PROPERTIES VERSION ${BULLET_VERSION})
 SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL PROPERTIES SOVERSION ${BULLET_VERSION})
 IF (BUILD_SHARED_LIBS)
 	TARGET_LINK_LIBRARIES(BulletSoftBody BulletDynamics)
 ENDIF (BUILD_SHARED_LIBS)
 IF (INSTALL_LIBS)
 	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
 			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL DESTINATION .)
 			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 			INSTALL(TARGETS BulletSoftBodySolvers_OpenCL DESTINATION lib${LIB_SUFFIX})
 				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
 			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
 		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL PROPERTIES FRAMEWORK true)
 			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
 		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 ENDIF (INSTALL_LIBS)
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/CMakeLists.txt
@@ -0,0 +1,75 @@
 # Builds the soft body OpenCL solver library against MiniCL.
 # Bullet headers are resolved relative to the Bullet source tree.
 INCLUDE_DIRECTORIES(
 ${BULLET_PHYSICS_SOURCE_DIR}/src
 )
 # Preprocessor switch applied to the sources compiled in this directory.
 ADD_DEFINITIONS(-DUSE_MINICL)
 # Solver implementation; the file lives one directory up.
 SET(BulletSoftBodyOpenCLSolvers_SRCS
 	../btSoftBodySolver_OpenCL.cpp
 )
 # Headers belonging to the library (also used as PUBLIC_HEADER for framework builds below).
 SET(BulletSoftBodyOpenCLSolvers_HDRS
 	../btSoftBodySolver_OpenCL.h
 	../../CPU/btSoftBodySolverData.h
 	../btSoftBodySolverVertexData_OpenCL.h
 	../btSoftBodySolverTriangleData_OpenCL.h
 	../btSoftBodySolverLinkData_OpenCL.h
 	../btSoftBodySolverBuffer_OpenCL.h
 )
 # OpenCL kernel sources.
 # Each name below is expanded to ../OpenCLC10/<name>.cl and added to the
 # source list of the library target.
 SET(BulletSoftBodyOpenCLSolvers_Shaders
 #	OutputToVertexArray
 	UpdateNormals
 	Integrate
 	UpdatePositions
 	UpdateNodes
 	SolvePositions
 	UpdatePositionsFromVelocities
 	ApplyForces
 	PrepareLinks
 	VSolveLinks
 )
 foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC10/${f}.cl")
 endforeach(f) 
 ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_Mini
 	${BulletSoftBodyOpenCLSolvers_SRCS} 
 	${BulletSoftBodyOpenCLSolvers_HDRS} 
 	${BulletSoftBodyOpenCLSolvers_OpenCLC}
 )
 SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES VERSION ${BULLET_VERSION})
 SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES SOVERSION ${BULLET_VERSION})
 # Attach the link dependencies to the library defined above. The target named
 # here must be the one created in this directory; BulletSoftBody is not.
 IF (BUILD_SHARED_LIBS)
 	TARGET_LINK_LIBRARIES(BulletSoftBodySolvers_OpenCL_Mini BulletSoftBody BulletDynamics)
 ENDIF (BUILD_SHARED_LIBS)
 # Installation rules; skipped entirely when generating distributable MSVC project files.
 IF (INSTALL_LIBS)
 	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
 			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Mini DESTINATION .)
 			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_Mini DESTINATION lib${LIB_SUFFIX})
 				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
 			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
 		# Framework builds mark the target as a framework and publish the header list.
 		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES FRAMEWORK true)
 			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_Mini PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
 		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 ENDIF (INSTALL_LIBS)
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/MiniCL/MiniCLTaskWrap.cpp
@@ -0,0 +1,40 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 // This translation unit pulls the soft body kernel sources in as ordinary code
 // and registers each kernel entry point by name.
 // NOTE(review): MINICL_REGISTER is presumably provided by this MiniCL header - confirm.
 #include <MiniCL/cl_MiniCL_Defs.h>
 // The .cl files wrap their whole content in MSTRINGIFY( ... ). Defining the macro
 // as the identity makes each include below expand to the wrapped code itself
 // instead of a string literal.
 #define MSTRINGIFY(A) A
 #include "../OpenCLC10/ApplyForces.cl"
 #include "../OpenCLC10/Integrate.cl"
 #include "../OpenCLC10/PrepareLinks.cl"
 #include "../OpenCLC10/SolvePositions.cl"
 #include "../OpenCLC10/UpdateNodes.cl"
 #include "../OpenCLC10/UpdateNormals.cl"
 #include "../OpenCLC10/UpdatePositions.cl"
 #include "../OpenCLC10/UpdatePositionsFromVelocities.cl"
 // VSolveLinks.cl is deliberately left out and has no matching registration below.
 // NOTE(review): the reason is not recorded here - confirm before enabling it.
 //#include "../OpenCLC10/VSolveLinks.cl"
 // One registration per kernel function defined by the includes above.
 MINICL_REGISTER(PrepareLinksKernel)
 MINICL_REGISTER(UpdatePositionsFromVelocitiesKernel)
 MINICL_REGISTER(SolvePositionsFromLinksKernel)
 MINICL_REGISTER(updateVelocitiesFromPositionsWithVelocitiesKernel)
 MINICL_REGISTER(updateVelocitiesFromPositionsWithoutVelocitiesKernel)
 MINICL_REGISTER(IntegrateKernel)
 MINICL_REGISTER(ApplyForcesKernel)
 MINICL_REGISTER(ResetNormalsAndAreasKernel)
 MINICL_REGISTER(NormalizeNormalsAndAreasKernel)
 MINICL_REGISTER(UpdateSoftBodiesKernel)
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
@@ -0,0 +1,79 @@
 # Builds the soft body OpenCL solver library for the NVidia OpenCL platform.
 # Bullet headers are resolved relative to the Bullet source tree.
 INCLUDE_DIRECTORIES(
 ${BULLET_PHYSICS_SOURCE_DIR}/src
 )
 # The SDK include directory is taken from the NVSDKCOMPUTE_ROOT environment variable.
 # The distributable MSVC project mode reads the ==NVSDKCOMPUTE_ROOT= name instead.
 IF(INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 	INCLUDE_DIRECTORIES( $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/inc	)
 ELSE()
 	INCLUDE_DIRECTORIES( $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/inc	)
 ENDIF()
 # Solver implementation; the file lives one directory up.
 SET(BulletSoftBodyOpenCLSolvers_SRCS
 	../btSoftBodySolver_OpenCL.cpp
 )
 # Headers belonging to the library (also used as PUBLIC_HEADER for framework builds below).
 SET(BulletSoftBodyOpenCLSolvers_HDRS
 	../btSoftBodySolver_OpenCL.h
 	../../CPU/btSoftBodySolverData.h
 	../btSoftBodySolverVertexData_OpenCL.h
 	../btSoftBodySolverTriangleData_OpenCL.h
 	../btSoftBodySolverLinkData_OpenCL.h
 	../btSoftBodySolverBuffer_OpenCL.h
 )
 # OpenCL kernel sources.
 # Each name below is expanded to ../OpenCLC/<name>.cl and added to the
 # source list of the library target.
 SET(BulletSoftBodyOpenCLSolvers_Shaders
 #	OutputToVertexArray
 	UpdateNormals
 	Integrate
 	UpdatePositions
 	UpdateNodes
 	SolvePositions
 	UpdatePositionsFromVelocities
 	ApplyForces
 	PrepareLinks
 	VSolveLinks
 )
 foreach(f ${BulletSoftBodyOpenCLSolvers_Shaders})
    LIST(APPEND BulletSoftBodyOpenCLSolvers_OpenCLC "../OpenCLC/${f}.cl")
 endforeach(f) 
 ADD_LIBRARY(BulletSoftBodySolvers_OpenCL_NVidia
 	${BulletSoftBodyOpenCLSolvers_SRCS} 
 	${BulletSoftBodyOpenCLSolvers_HDRS} 
 	${BulletSoftBodyOpenCLSolvers_OpenCLC}
 )
 SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES VERSION ${BULLET_VERSION})
 SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES SOVERSION ${BULLET_VERSION})
 # Attach the link dependencies to the library defined above. The target named
 # here must be the one created in this directory; BulletSoftBody is not.
 IF (BUILD_SHARED_LIBS)
 	TARGET_LINK_LIBRARIES(BulletSoftBodySolvers_OpenCL_NVidia BulletSoftBody BulletDynamics)
 ENDIF (BUILD_SHARED_LIBS)
 # Installation rules; skipped entirely when generating distributable MSVC project files.
 IF (INSTALL_LIBS)
 	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
 			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_NVidia DESTINATION .)
 			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 				INSTALL(TARGETS BulletSoftBodySolvers_OpenCL_NVidia DESTINATION lib${LIB_SUFFIX})
 				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DESTINATION include FILES_MATCHING PATTERN "*.h")
 			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
 		# Framework builds mark the target as a framework and publish the header list.
 		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES FRAMEWORK true)
 			SET_TARGET_PROPERTIES(BulletSoftBodySolvers_OpenCL_NVidia PROPERTIES PUBLIC_HEADER "${BulletSoftBodyOpenCLSolvers_HDRS}")
 		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
 	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 ENDIF (INSTALL_LIBS)
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ApplyForces.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ApplyForces.cl
@@ -0,0 +1,91 @@
 MSTRINGIFY(
 // Dot product over the x, y and z lanes only; the w lane is ignored.
 float adot3(float4 a, float4 b)
 {
 	return a.x*b.x + a.y*b.y + a.z*b.z;
 }
 // Returns the axis a scaled by the three-lane dot product of v and a.
 float4 projectOnAxis( float4 v, float4 a )
 {
 	float scale = adot3(v, a);
 	return (a*scale);
 }
 // Per-vertex kernel. Adds the cloth acceleration to the vertex velocity and
 // then accumulates lift and drag forces caused by the wind relative to the vertex.
 // Work items with an index of numNodes or more do nothing.
 // NOTE(review): dot and normalize below act on all four lanes while adot3
 // ignores w - confirm that the w lanes of the inputs are always zero.
 __kernel void 
 ApplyForcesKernel(
 	const uint numNodes,
 	const float solverdt,
 	const float epsilon,
 	__global int * g_vertexClothIdentifier,
 	__global float4 * g_vertexNormal,
 	__global float * g_vertexArea,
 	__global float * g_vertexInverseMass,
 	__global float * g_clothLiftFactor,
 	__global float * g_clothDragFactor,
 	__global float4 * g_clothWindVelocity,
 	__global float4 * g_clothAcceleration,
 	__global float * g_clothMediumDensity,
 	__global float4 * g_vertexForceAccumulator,
 	__global float4 * g_vertexVelocity GUID_ARG)
 {
 	unsigned int vertex = get_global_id(0);
 	if( vertex >= numNodes )
 		return;
 	int cloth = g_vertexClothIdentifier[vertex];
 	float invMass = g_vertexInverseMass[vertex];
 	// Vertices whose inverse mass is not positive are left untouched.
 	if( !(invMass > 0.0f) )
 		return;
 	float4 velocity = g_vertexVelocity[vertex];
 	float4 normal = g_vertexNormal[vertex];
 	float area = g_vertexArea[vertex];
 	float4 accumulated = g_vertexForceAccumulator[vertex];
 	// Parameters of the cloth this vertex belongs to.
 	float4 acceleration = g_clothAcceleration[cloth];
 	float4 wind = g_clothWindVelocity[cloth];
 	float lift = g_clothLiftFactor[cloth];
 	float drag = g_clothDragFactor[cloth];
 	float density = g_clothMediumDensity[cloth];
 	// The acceleration changes the velocity directly instead of going through a force.
 	velocity += (acceleration*solverdt);
 	g_vertexVelocity[vertex] = velocity;
 	float4 relativeWind = velocity - wind;
 	float relativeSpeedSq = dot(relativeWind, relativeWind);
 	// No aerodynamic force unless the relative speed exceeds the threshold.
 	if( !(relativeSpeedSq > epsilon) )
 		return;
 	// Flip the normal so that it does not oppose the relative wind.
 	float facing = dot(normal, relativeWind);
 	normal = normal * (facing < 0 ? -1.f : 1.f);
 	float dvNormal = dot(normal, relativeWind);
 	if( !(dvNormal > 0) )
 		return;
 	float4 force = (float4)(0.f, 0.f, 0.f, 0.f);
 	float c0 = area * dvNormal * relativeSpeedSq / 2.f;
 	float c1 = c0 * density;
 	force += normal * (-c1 * lift);
 	force += normalize(relativeWind)*(-c1 * drag);
 	float dtim = solverdt * invMass;
 	float4 forceDTIM = force * dtim;
 	// If the velocity change from this force would exceed the current velocity
 	// the force is replaced by one that cancels the velocity along the force axis.
 	if( dot(forceDTIM, forceDTIM) > dot(velocity, velocity) )
 		accumulated = accumulated - (projectOnAxis(velocity, normalize(force))/dtim);
 	else
 		accumulated = accumulated + force;
 	g_vertexForceAccumulator[vertex] = accumulated;
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/Integrate.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/Integrate.cl
@@ -0,0 +1,35 @@
 MSTRINGIFY(
 // Per-vertex integration step. Stores the current position as the previous
 // position and advances velocity by force * inverse mass * solverdt and then
 // position by velocity * solverdt. The force accumulator is cleared afterwards.
 // Work items with an index of numNodes or more do nothing.
 __kernel void
 IntegrateKernel( 
 	const int numNodes,
 	const float solverdt,
 	__global float * g_vertexInverseMasses,
 	__global float4 * g_vertexPositions,
 	__global float4 * g_vertexVelocity,
 	__global float4 * g_vertexPreviousPositions,
 	__global float4 * g_vertexForceAccumulator GUID_ARG)
 {
 	int vertex = get_global_id(0);
 	if( vertex >= numNodes )
 		return;
 	float4 pos = g_vertexPositions[vertex];
 	float4 vel = g_vertexVelocity[vertex];
 	float4 accumulatedForce = g_vertexForceAccumulator[vertex];
 	float invMass = g_vertexInverseMasses[vertex];
 	// Remember where the vertex was before this step.
 	g_vertexPreviousPositions[vertex] = pos;
 	vel += accumulatedForce * invMass * solverdt;
 	pos += vel * solverdt;
 	g_vertexForceAccumulator[vertex] = (float4)(0.f, 0.f, 0.f, 0.0f);
 	g_vertexPositions[vertex]        = pos;
 	g_vertexVelocity[vertex]         = vel;
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/PrepareLinks.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/PrepareLinks.cl
@@ -0,0 +1,41 @@
 MSTRINGIFY(
 // Dot product over the x, y and z lanes only; the w lane is ignored.
 float dot3(float4 a, float4 b)
 {
 	return a.x*b.x + a.y*b.y + a.z*b.z;
 }
 // Per-link kernel. Stores the vector between the previous positions of the two
 // link end points and the reciprocal of its squared length times the link massLSC.
 // Work items with an index of numLinks or more do nothing.
 __kernel void 
 PrepareLinksKernel( 
 	const int numLinks,
 	__global int2 * g_linksVertexIndices,
 	__global float * g_linksMassLSC,
 	__global float4 * g_nodesPreviousPosition,
 	__global float * g_linksLengthRatio,
 	__global float4 * g_linksCurrentLength GUID_ARG)
 {
 	int link = get_global_id(0);
 	if( link >= numLinks )
 		return;
 	int2 ends = g_linksVertexIndices[link];
 	int first = ends.x;
 	int second = ends.y;
 	float4 previous0 = g_nodesPreviousPosition[first];
 	float4 previous1 = g_nodesPreviousPosition[second];
 	float massLSC = g_linksMassLSC[link];
 	// Vector from the first end point to the second one.
 	float4 span = previous1 - previous0;
 	float scaledLengthSq = dot3(span, span)*massLSC;
 	float ratio = 1.0f/scaledLengthSq;
 	g_linksCurrentLength[link] = span;
 	g_linksLengthRatio[link]   = ratio;
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositions.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositions.cl
@@ -0,0 +1,57 @@
 MSTRINGIFY(
 // Dot product over the x, y and z lanes only; the w lane is ignored.
 float mydot3(float4 a, float4 b)
 {
 	return a.x*b.x + a.y*b.y + a.z*b.z;
 }
 // Per-link position correction for the links startLink .. startLink+numLinks-1.
 // Moves both end points along the link direction by an amount derived from the
 // difference between squared rest length and squared current length. Each end
 // point is weighted by its inverse mass. Links whose massLSC is not positive are skipped.
 // The parameter ti is not used by this kernel.
 __kernel void 
 SolvePositionsFromLinksKernel( 
 	const int startLink,
 	const int numLinks,
 	const float kst,
 	const float ti,
 	__global int2 * g_linksVertexIndices,
 	__global float * g_linksMassLSC,
 	__global float * g_linksRestLengthSquared,
 	__global float * g_verticesInverseMass,
 	__global float4 * g_vertexPositions GUID_ARG)
 {
 	int link = get_global_id(0) + startLink;
 	if( get_global_id(0) >= numLinks )
 		return;
 	float massLSC = g_linksMassLSC[link];
 	float restLengthSq = g_linksRestLengthSquared[link];
 	if( !(massLSC > 0.0f) )
 		return;
 	int2 ends = g_linksVertexIndices[link];
 	int first = ends.x;
 	int second = ends.y;
 	float4 p0 = g_vertexPositions[first];
 	float4 p1 = g_vertexPositions[second];
 	float invMass0 = g_verticesInverseMass[first];
 	float invMass1 = g_verticesInverseMass[second]; 
 	float4 delta = p1 - p0;
 	float lengthSq = mydot3(delta, delta);
 	float k = ((restLengthSq - lengthSq)/(massLSC*(restLengthSq+lengthSq)))*kst;
 	// The two end points move in opposite directions along delta.
 	p0 = p0 - delta*(k*invMass0);
 	p1 = p1 + delta*(k*invMass1);
 	g_vertexPositions[first] = p0;
 	g_vertexPositions[second] = p1;
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateConstants.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateConstants.cl
@@ -0,0 +1,44 @@
 MSTRINGIFY(
 // Per-link kernel. Derives the constant link data from the current vertex state:
 // rest length and squared rest length from the distance between the two end points
 // and massLSC as the sum of both inverse masses divided by the material stiffness.
 // Work items with an index of numLinks or more do nothing.
 // A float4 based dot3 helper used to be defined here but is disabled; the
 // kernel relies on float3 values and the built-in dot.
 // NOTE(review): this signature carries no GUID_ARG - confirm whether that is intended.
 __kernel void 
 UpdateConstantsKernel( 
 	const int numLinks,
 	__global int2 * g_linksVertexIndices,
 	__global float4 * g_vertexPositions,
 	__global float * g_vertexInverseMasses,
 	__global float * g_linksMaterialLSC,
 	__global float * g_linksMassLSC,
 	__global float * g_linksRestLengthSquared,
 	__global float * g_linksRestLengths)
 {
 	int link = get_global_id(0);
 	if( link >= numLinks )
 		return;
 	int2 ends = g_linksVertexIndices[link];
 	int first = ends.x;
 	int second = ends.y;
 	float stiffness = g_linksMaterialLSC[ link ];
 	float3 p0 = g_vertexPositions[first].xyz;
 	float3 p1 = g_vertexPositions[second].xyz;
 	float invMass0 = g_vertexInverseMasses[first];
 	float invMass1 = g_vertexInverseMasses[second];
 	float3 delta = p0 - p1;
 	float distanceSq = dot(delta, delta);
 	float distance = sqrt(distanceSq);
 	g_linksRestLengths[link] = distance;
 	g_linksMassLSC[link] = (invMass0 + invMass1)/stiffness;
 	// The squared rest length is recomputed from the rounded length.
 	g_linksRestLengthSquared[link] = distance*distance;
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNodes.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNodes.cl
@@ -0,0 +1,39 @@
 MSTRINGIFY(
 // Per-vertex kernel. Adds the position change of this step scaled by the cloth
 // velocity correction coefficient and isolverdt onto the stored velocity. The
 // result is damped by (1 - damping factor) and the vertex force is cleared.
 // Work items with an index of numNodes or more do nothing.
 __kernel void 
 updateVelocitiesFromPositionsWithVelocitiesKernel( 
 	int numNodes,
 	float isolverdt,
 	__global float4 * g_vertexPositions,
 	__global float4 * g_vertexPreviousPositions,
 	__global int * g_vertexClothIndices,
 	__global float *g_clothVelocityCorrectionCoefficients,
 	__global float * g_clothDampingFactor,
 	__global float4 * g_vertexVelocities,
 	__global float4 * g_vertexForces GUID_ARG)
 {
 	int vertex = get_global_id(0);
 	if( vertex >= numNodes )
 		return;
 	float4 current = g_vertexPositions[vertex];
 	float4 previous = g_vertexPreviousPositions[vertex];
 	float4 vel = g_vertexVelocities[vertex];
 	int cloth = g_vertexClothIndices[vertex];
 	float correction = g_clothVelocityCorrectionCoefficients[cloth];
 	float damping = g_clothDampingFactor[cloth];
 	float keep = (1.f - damping);
 	float4 moved = current - previous;
 	vel += moved*correction*isolverdt;
 	// Apply the damping.
 	vel *= keep;
 	g_vertexVelocities[vertex] = vel;
 	g_vertexForces[vertex] = (float4)(0.f, 0.f, 0.f, 0.f);
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNormals.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdateNormals.cl
@@ -0,0 +1,102 @@
 MSTRINGIFY(
 // Length of a with the w lane forced to zero first.
 float length3(float4 a)
 {
 	a.w = 0;
 	return length(a);
 }
 // Normalized copy of a with the w lane forced to zero first.
 float4 normalize3(float4 a)
 {
 	a.w = 0;
 	return normalize(a);
 }
 // Clears the accumulated normal and area of every vertex below numNodes.
 __kernel void 
 ResetNormalsAndAreasKernel(
 	const unsigned int numNodes,
 	__global float4 * g_vertexNormals,
 	__global float * g_vertexArea GUID_ARG)
 {
 	if( get_global_id(0) >= numNodes )
 		return;
 	g_vertexNormals[get_global_id(0)] = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
 	g_vertexArea[get_global_id(0)]    = 0.0f;
 }
 // Per-face kernel for the faces startFace .. startFace+numFaces-1.
 // Takes the cross product of two triangle edges. Its normalized form becomes
 // the triangle normal and its length the triangle area value. The raw cross
 // product and the length are also added onto the three corner vertices.
 // NOTE(review): the vertex accumulations are plain read-modify-write; presumably
 // the host only dispatches faces together that share no vertex - confirm.
 __kernel void 
 UpdateSoftBodiesKernel(
 	const unsigned int startFace,
 	const unsigned int numFaces,
 	__global int4 * g_triangleVertexIndexSet,
 	__global float4 * g_vertexPositions,
 	__global float4 * g_vertexNormals,
 	__global float * g_vertexArea,
 	__global float4 * g_triangleNormals,
 	__global float * g_triangleArea GUID_ARG)
 {
 	int faceID = get_global_id(0) + startFace;
 	if( get_global_id(0) >= numFaces )
 		return;
 	int4 corners = g_triangleVertexIndexSet[ faceID ];
 	int i0 = corners.x;
 	int i1 = corners.y;
 	int i2 = corners.z;
 	float4 p0 = g_vertexPositions[i0];
 	float4 p1 = g_vertexPositions[i1];
 	float4 p2 = g_vertexPositions[i2];
 	// All accumulators are read before any of them is written back.
 	float4 normal0 = g_vertexNormals[i0];
 	float4 normal1 = g_vertexNormals[i1];
 	float4 normal2 = g_vertexNormals[i2];
 	float area0 = g_vertexArea[i0];
 	float area1 = g_vertexArea[i1];
 	float area2 = g_vertexArea[i2];
 	float4 edge0 = p1 - p0;
 	float4 edge1 = p2 - p0;
 	float4 faceNormal = cross(edge0, edge1);
 	float triangleArea = length(faceNormal);
 	normal0 = normal0 + faceNormal;
 	normal1 = normal1 + faceNormal;
 	normal2 = normal2 + faceNormal;
 	area0 = area0 + triangleArea;
 	area1 = area1 + triangleArea;
 	area2 = area2 + triangleArea;
 	g_triangleNormals[faceID] = normalize3(faceNormal);
 	g_vertexNormals[i0] = normal0;
 	g_vertexNormals[i1] = normal1;
 	g_vertexNormals[i2] = normal2;
 	g_triangleArea[faceID] = triangleArea;
 	g_vertexArea[i0] = area0;
 	g_vertexArea[i1] = area1;
 	g_vertexArea[i2] = area2;
 }
 // Per-vertex kernel. Normalizes the accumulated vertex normal and divides the
 // accumulated area by the number of triangles recorded for the vertex.
 // NOTE(review): a triangle count of zero makes the area division a division
 // by zero - confirm that every vertex belongs to at least one triangle.
 __kernel void 
 NormalizeNormalsAndAreasKernel( 
 	const unsigned int numNodes,
 	__global int * g_vertexTriangleCount,
 	__global float4 * g_vertexNormals,
 	__global float * g_vertexArea GUID_ARG)
 {
 	if( get_global_id(0) >= numNodes )
 		return;
 	float4 normal = g_vertexNormals[get_global_id(0)];
 	float area = g_vertexArea[get_global_id(0)];
 	int numTriangles = g_vertexTriangleCount[get_global_id(0)];
 	g_vertexNormals[get_global_id(0)] = normalize3(normal);
 	g_vertexArea[get_global_id(0)] = area/(float)(numTriangles);
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositions.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositions.cl
@@ -0,0 +1,34 @@
 MSTRINGIFY(
 // Per-vertex kernel. Replaces the stored velocity with the position change of
 // this step scaled by (1 - damping factor) and isolverdt and clears the vertex force.
 // The previously stored velocity does not contribute to the result and is
 // therefore not loaded.
 // Work items with an index of numNodes or more do nothing.
 __kernel void 
 updateVelocitiesFromPositionsWithoutVelocitiesKernel( 
 	const int numNodes,
 	const float isolverdt,
 	__global float4 * g_vertexPositions,
 	__global float4 * g_vertexPreviousPositions,
 	__global int * g_vertexClothIndices,
 	__global float * g_clothDampingFactor,
 	__global float4 * g_vertexVelocities,
 	__global float4 * g_vertexForces GUID_ARG)
 {
 	int nodeID = get_global_id(0);
 	if( nodeID >= numNodes )
 		return;
 	float4 position = g_vertexPositions[nodeID];
 	float4 previousPosition = g_vertexPreviousPositions[nodeID];
 	int clothIndex = g_vertexClothIndices[nodeID];
 	float dampingFactor = g_clothDampingFactor[clothIndex];
 	float velocityCoefficient = (1.f - dampingFactor);
 	float4 difference = position - previousPosition;
 	float4 velocity = difference*velocityCoefficient*isolverdt;
 	g_vertexVelocities[nodeID] = velocity;
 	g_vertexForces[nodeID] = (float4)(0.f, 0.f, 0.f, 0.f);
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositionsFromVelocities.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/UpdatePositionsFromVelocities.cl
@@ -0,0 +1,28 @@
 MSTRINGIFY(
 // Per-vertex kernel. Advances the previous position by velocity * solverSDT and
 // stores the result as both the current and the previous position.
 // Work items with an index of numNodes or more do nothing.
 __kernel void 
 UpdatePositionsFromVelocitiesKernel( 
 	const int numNodes,
 	const float solverSDT,
 	__global float4 * g_vertexVelocities,
 	__global float4 * g_vertexPreviousPositions,
 	__global float4 * g_vertexCurrentPosition GUID_ARG)
 {
 	int vertex = get_global_id(0);
 	if( vertex >= numNodes )
 		return;
 	float4 start = g_vertexPreviousPositions[vertex];
 	float4 vel   = g_vertexVelocities[vertex];
 	float4 moved = start + vel*solverSDT;
 	g_vertexCurrentPosition[vertex]   = moved;
 	g_vertexPreviousPositions[vertex] = moved;
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/VSolveLinks.cl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/VSolveLinks.cl
@@ -0,0 +1,45 @@
 MSTRINGIFY(
 // Per-link velocity correction for the links startLink .. startLink+numLinks-1.
 // Projects the velocity difference of the two end points onto the stored link
 // vector and applies the resulting change to both end points in opposite
 // directions. Each end point is weighted by its inverse mass. The w lane of
 // the written velocities is set to zero.
 __kernel void 
 VSolveLinksKernel( 
 	int startLink,
 	int numLinks,
 	float kst,
 	__global int2 * g_linksVertexIndices,
 	__global float * g_linksLengthRatio,
 	__global float4 * g_linksCurrentLength,
 	__global float * g_vertexInverseMass,
 	__global float4 * g_vertexVelocity GUID_ARG)
 {
 	int link = get_global_id(0) + startLink;
 	if( get_global_id(0) >= numLinks )
 		return;
 	int2 ends = g_linksVertexIndices[link];
 	int first = ends.x;
 	int second = ends.y;
 	float lengthRatio = g_linksLengthRatio[link];
 	float3 linkVector = g_linksCurrentLength[link].xyz;
 	float3 vel0 = g_vertexVelocity[first].xyz;
 	float3 vel1 = g_vertexVelocity[second].xyz;
 	float invMass0 = g_vertexInverseMass[first];
 	float invMass1 = g_vertexInverseMass[second]; 
 	float3 relative = vel0 - vel1;
 	float along = dot(linkVector, relative);
 	float j = -along*lengthRatio*kst;
 	float3 change0 = linkVector*(j*invMass0);
 	float3 change1 = linkVector*(j*invMass1);
 	vel0 += change0;
 	vel1 -= change1;
 	g_vertexVelocity[first] = (float4)(vel0, 0.f);
 	g_vertexVelocity[second] = (float4)(vel1, 0.f);
 }
 );
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h
@@ -17,7 +17,16 @@ subject to the following restrictions:
 #define BT_SOFT_BODY_SOLVER_BUFFER_OPENCL_H
 // OpenCL support
-#include <CL/cl.hpp>
+
 #ifdef USE_MINICL
 	#include "MiniCL/cl.h"
 #else //USE_MINICL
 	#ifdef __APPLE__
 		#include <OpenCL/OpenCL.h>
 	#else
 		#include <CL/cl.h>
 	#endif //__APPLE__
 #endif//USE_MINICL
 #ifndef SAFE_RELEASE
 #define SAFE_RELEASE(p)      { if(p) { (p)->Release(); (p)=NULL; } }
@@ -25,22 +34,25 @@ subject to the following restrictions:
 template <typename ElementType> class btOpenCLBuffer
 {
-protected:
+public:
 	cl::CommandQueue m_queue;
 	btAlignedObjectArray< ElementType > * m_CPUBuffer;
 	cl::Buffer m_buffer;
 	cl_command_queue	m_cqCommandQue;
 	cl_context			m_clContext;
 	cl_mem				m_buffer;
 	btAlignedObjectArray< ElementType > * m_CPUBuffer;
 	int  m_gpuSize;
 	bool m_onGPU;
 	bool m_readOnlyOnGPU;
 	bool m_allocated;
 	// TODO: Remove this once C++ bindings are fixed
 	cl::Context context;
-	bool createBuffer( cl::Buffer *preexistingBuffer = 0)
+
 	bool createBuffer( cl_mem* preexistingBuffer = 0)
 	{
 		cl_int err;
@@ -49,12 +61,11 @@ protected:
 			m_buffer = *preexistingBuffer;
 		} 
 		else {
-			m_buffer = cl::Buffer(
+
-					context, 
+			cl_mem_flags flags= m_readOnlyOnGPU ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE;
-					m_readOnlyOnGPU ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE, 
+
-					m_CPUBuffer->size() * sizeof(ElementType), 
+			size_t size = m_CPUBuffer->size() * sizeof(ElementType);
-					0, 
+			m_buffer = clCreateBuffer(m_clContext, flags, size, 0, &err);
 					&err);
 			if( err != CL_SUCCESS )
 			{
 				btAssert( "Buffer::Buffer(m_buffer)");
@@ -62,35 +73,31 @@ protected:
 		}
 		m_gpuSize = m_CPUBuffer->size();
 		return true;
 	}
 public:
-	btOpenCLBuffer( 
+	btOpenCLBuffer( cl_command_queue	commandQue,cl_context ctx, btAlignedObjectArray< ElementType >* CPUBuffer, bool readOnly)
-		cl::CommandQueue queue,
+		:m_cqCommandQue(commandQue),
-		btAlignedObjectArray< ElementType > *CPUBuffer, 
+		m_clContext(ctx),
 		bool readOnly) :
 		m_queue(queue),
 		m_CPUBuffer(CPUBuffer),
 		m_gpuSize(0),
 		m_onGPU(false),
 		m_readOnlyOnGPU(readOnly),
 		m_allocated(false)
 	{
 		context = m_queue.getInfo<CL_QUEUE_CONTEXT>();
 	}
 	~btOpenCLBuffer()
 	{
 	}
 	cl::Buffer getBuffer()
 	{
 		return m_buffer;
 	}
 	bool moveToGPU()
 	{
 		cl_int err;
 		if( (m_CPUBuffer->size() != m_gpuSize) )
@@ -107,12 +114,12 @@ public:
 				m_allocated = true;
 			}
-			err = m_queue.enqueueWriteBuffer(
+			size_t size = m_CPUBuffer->size() * sizeof(ElementType);
-				m_buffer,
+			err = clEnqueueWriteBuffer(m_cqCommandQue,m_buffer,
 				CL_FALSE,
 				0,
-				m_CPUBuffer->size() * sizeof(ElementType), 
+				size, 
-				&((*m_CPUBuffer)[0]));
+				&((*m_CPUBuffer)[0]),0,0,0);
 			if( err != CL_SUCCESS )
 			{
 				btAssert( "CommandQueue::enqueueWriteBuffer(m_buffer)" );
@@ -122,20 +129,23 @@ public:
 		}
 		return true;
 	}
 	bool moveFromGPU()
 	{
 		cl_int err;
 		if (m_CPUBuffer->size() > 0) {
 			if (m_onGPU && !m_readOnlyOnGPU) {
-				err = m_queue.enqueueReadBuffer(
+				size_t size = m_CPUBuffer->size() * sizeof(ElementType);
 				err = clEnqueueReadBuffer(m_cqCommandQue,
 					m_buffer,
 					CL_TRUE,
 					0,
-					m_CPUBuffer->size() * sizeof(ElementType), 
+					size,
-					&((*m_CPUBuffer)[0]));
+					&((*m_CPUBuffer)[0]),0,0,0);
 				if( err != CL_SUCCESS )
 				{
@@ -151,16 +161,17 @@ public:
 	bool copyFromGPU()
 	{
 		cl_int err;
 		size_t size = m_CPUBuffer->size() * sizeof(ElementType);
 		if (m_CPUBuffer->size() > 0) {
 			if (m_onGPU && !m_readOnlyOnGPU) {
-				err = m_queue.enqueueReadBuffer(
+				err = clEnqueueReadBuffer(m_cqCommandQue,
 					m_buffer,
 					CL_TRUE,
-					0,
+					0,size, 
-					m_CPUBuffer->size() * sizeof(ElementType), 
+					&((*m_CPUBuffer)[0]),0,0,0);
 					&((*m_CPUBuffer)[0]));
 				if( err != CL_SUCCESS )
 				{
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h
@@ -13,8 +13,8 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
-#include "BulletSoftBody/Solvers/CPU/btSoftBodySolverData.h"
+#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
-#include "BulletSoftBody/Solvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
 #ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_H
@@ -25,7 +25,9 @@ class btSoftBodyLinkDataOpenCL : public btSoftBodyLinkData
 {
 public:
 	bool				m_onGPU;
-	cl::CommandQueue	m_queue;
+
 	cl_command_queue	m_cqCommandQue;
 	btOpenCLBuffer<LinkNodePair> m_clLinks;
 	btOpenCLBuffer<float>							      m_clLinkStrength;
@@ -36,6 +38,24 @@ public:
 	btOpenCLBuffer<float>								  m_clLinksRestLength;
 	btOpenCLBuffer<float>								  m_clLinksMaterialLinearStiffnessCoefficient;
 	struct BatchPair
 	{
 		/** Index of the first element that belongs to the batch. */
 		int start;
 		/** Number of elements in the batch. */
 		int length;
 		/** Creates an empty batch: start and length are both 0. */
 		BatchPair()
 		{
 			start = 0;
 			length = 0;
 		}
 		/** Creates a batch that begins at index s and spans l elements. */
 		BatchPair( int s, int l )
 		{
 			start = s;
 			length = l;
 		}
 	};
 	/**
 	 * Link addressing information for each cloth.
 	 * Allows link locations to be computed independently of data batching.
@@ -45,9 +65,9 @@ public:
 	/**
 	 * Start and length values for computation batches over link data.
 	 */
-	btAlignedObjectArray< std::pair< int, int > >		m_batchStartLengths;
+	btAlignedObjectArray< BatchPair >		m_batchStartLengths;
-	btSoftBodyLinkDataOpenCL(cl::CommandQueue queue);
+	btSoftBodyLinkDataOpenCL(cl_command_queue queue, cl_context ctx);
 	virtual ~btSoftBodyLinkDataOpenCL();
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h
@@ -14,8 +14,8 @@ subject to the following restrictions:
 */
-#include "BulletSoftBody/Solvers/CPU/btSoftBodySolverData.h"
+#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
-#include "BulletSoftBody/Solvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
 #ifndef BT_SOFT_BODY_SOLVER_TRIANGLE_DATA_OPENCL_H
@@ -26,7 +26,7 @@ class btSoftBodyTriangleDataOpenCL : public btSoftBodyTriangleData
 {
 public:
 	bool				m_onGPU;
-	cl::CommandQueue    m_queue;
+	cl_command_queue    m_queue;
 	btOpenCLBuffer<btSoftBodyTriangleData::TriangleNodeSet>					m_clVertexIndices;
 	btOpenCLBuffer<float>								m_clArea;
@@ -41,10 +41,20 @@ public:
 	/**
 	 * Start and length values for computation batches over triangle data.
 	 */
-	btAlignedObjectArray< std::pair< int, int > >		m_batchStartLengths;
+	struct btSomePair
 	{
 		// Plain pair of ints used as the element type of m_batchStartLengths.
 		// NOTE(review): this default constructor leaves first and second
 		// uninitialized - confirm every user assigns both before reading them.
 		btSomePair() {}
 		// Stores f in first and s in second.
 		btSomePair(int f,int s)
 			:first(f),second(s)
 		{
 		}
 		// presumably the start index of a batch - TODO confirm against generateBatches
 		int first;
 		// presumably the number of entries in that batch - TODO confirm against generateBatches
 		int second;
 	};
 	btAlignedObjectArray< btSomePair >		m_batchStartLengths;
 public:
-	btSoftBodyTriangleDataOpenCL( cl::CommandQueue queue );
+	btSoftBodyTriangleDataOpenCL( cl_command_queue queue, cl_context ctx );
 	virtual ~btSoftBodyTriangleDataOpenCL();
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h
@@ -13,8 +13,8 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
-#include "BulletSoftBody/Solvers/CPU/btSoftBodySolverData.h"
+#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h"
-#include "BulletSoftBody/Solvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
 #ifndef BT_SOFT_BODY_SOLVER_VERTEX_DATA_OPENCL_H
 #define BT_SOFT_BODY_SOLVER_VERTEX_DATA_OPENCL_H
@@ -24,7 +24,7 @@ class btSoftBodyVertexDataOpenCL : public btSoftBodyVertexData
 {
 protected:
 	bool		m_onGPU;
-	cl::CommandQueue m_queue;
+	cl_command_queue	m_queue;
 public:
 	btOpenCLBuffer<int>									m_clClothIdentifier;
@@ -37,7 +37,7 @@ public:
 	btOpenCLBuffer<float>									m_clVertexArea;
 	btOpenCLBuffer<int>									m_clVertexTriangleCount;
 public:
-	btSoftBodyVertexDataOpenCL( cl::CommandQueue queue);
+	btSoftBodyVertexDataOpenCL( cl_command_queue queue,  cl_context ctx);
 	virtual ~btSoftBodyVertexDataOpenCL();
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
@@ -16,10 +16,18 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
 #include "vectormath/vmInclude.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolver_OpenCL.h"
+#include <stdio.h> //@todo: remove the debugging printf at some stage
-#include "BulletSoftBody/VertexBuffers/btSoftBodySolverVertexBuffer.h"
+#include "btSoftBodySolver_OpenCL.h"
 #include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
 #include "BulletSoftBody/btSoftBody.h"
     static const size_t workGroupSize = 128;
 //CL_VERSION_1_1 seems broken on NVidia SDK so just disable it
 #if (0)//CL_VERSION_1_1 == 1)
 //OpenCL 1.1 kernels use float3
 #define MSTRINGIFY(A) #A
 static char* PrepareLinksCLString = 
 #include "OpenCLC/PrepareLinks.cl"
@@ -41,19 +49,43 @@ static char* UpdateNormalsCLString =
 #include "OpenCLC/UpdateNormals.cl"
 static char* VSolveLinksCLString = 
 #include "OpenCLC/VSolveLinks.cl"
 #else
 ////OpenCL 1.0 kernels don't use float3
 #define MSTRINGIFY(A) #A
 static char* PrepareLinksCLString = 
 #include "OpenCLC10/PrepareLinks.cl"
 static char* UpdatePositionsFromVelocitiesCLString = 
 #include "OpenCLC10/UpdatePositionsFromVelocities.cl"
 static char* SolvePositionsCLString = 
 #include "OpenCLC10/SolvePositions.cl"
 static char* UpdateNodesCLString = 
 #include "OpenCLC10/UpdateNodes.cl"
 static char* UpdatePositionsCLString = 
 #include "OpenCLC10/UpdatePositions.cl"
 static char* UpdateConstantsCLString = 
 #include "OpenCLC10/UpdateConstants.cl"
 static char* IntegrateCLString = 
 #include "OpenCLC10/Integrate.cl"
 static char* ApplyForcesCLString = 
 #include "OpenCLC10/ApplyForces.cl"
 static char* UpdateNormalsCLString = 
 #include "OpenCLC10/UpdateNormals.cl"
 static char* VSolveLinksCLString = 
 #include "OpenCLC10/VSolveLinks.cl"
 #endif //CL_VERSION_1_1
-btSoftBodyVertexDataOpenCL::btSoftBodyVertexDataOpenCL( cl::CommandQueue queue) :
+btSoftBodyVertexDataOpenCL::btSoftBodyVertexDataOpenCL( cl_command_queue queue, cl_context ctx) :
    m_queue(queue),
-	m_clClothIdentifier( queue, &m_clothIdentifier, false ),
+	m_clClothIdentifier( queue, ctx, &m_clothIdentifier, false ),
-	m_clVertexPosition( queue, &m_vertexPosition, false ),
+	m_clVertexPosition( queue, ctx, &m_vertexPosition, false ),
-	m_clVertexPreviousPosition( queue, &m_vertexPreviousPosition, false ),
+	m_clVertexPreviousPosition( queue, ctx, &m_vertexPreviousPosition, false ),
-	m_clVertexVelocity( queue, &m_vertexVelocity, false ),
+	m_clVertexVelocity( queue, ctx, &m_vertexVelocity, false ),
-	m_clVertexForceAccumulator( queue, &m_vertexForceAccumulator, false ),
+	m_clVertexForceAccumulator( queue, ctx, &m_vertexForceAccumulator, false ),
-	m_clVertexNormal( queue, &m_vertexNormal, false ),
+	m_clVertexNormal( queue, ctx, &m_vertexNormal, false ),
-	m_clVertexInverseMass( queue, &m_vertexInverseMass, false ),
+	m_clVertexInverseMass( queue, ctx, &m_vertexInverseMass, false ),
-	m_clVertexArea( queue, &m_vertexArea, false ),
+	m_clVertexArea( queue, ctx, &m_vertexArea, false ),
-	m_clVertexTriangleCount( queue, &m_vertexTriangleCount, false )
+	m_clVertexTriangleCount( queue, ctx, &m_vertexTriangleCount, false )
 {
 }
@@ -108,16 +140,16 @@ bool btSoftBodyVertexDataOpenCL::moveFromAccelerator()
-btSoftBodyLinkDataOpenCL::btSoftBodyLinkDataOpenCL(cl::CommandQueue queue) :
+btSoftBodyLinkDataOpenCL::btSoftBodyLinkDataOpenCL(cl_command_queue queue,  cl_context ctx) 
-    m_queue(queue),
+:m_cqCommandQue(queue),
-	m_clLinks( queue, &m_links, false ),
+	m_clLinks( queue, ctx, &m_links, false ),
-	m_clLinkStrength( queue, &m_linkStrength, false ),
+	m_clLinkStrength( queue, ctx, &m_linkStrength, false ),
-	m_clLinksMassLSC( queue, &m_linksMassLSC, false ),
+	m_clLinksMassLSC( queue, ctx, &m_linksMassLSC, false ),
-	m_clLinksRestLengthSquared( queue, &m_linksRestLengthSquared, false ),
+	m_clLinksRestLengthSquared( queue, ctx, &m_linksRestLengthSquared, false ),
-	m_clLinksCLength( queue, &m_linksCLength, false ),
+	m_clLinksCLength( queue, ctx, &m_linksCLength, false ),
-	m_clLinksLengthRatio( queue, &m_linksLengthRatio, false ),
+	m_clLinksLengthRatio( queue, ctx, &m_linksLengthRatio, false ),
-	m_clLinksRestLength( queue, &m_linksRestLength, false ),
+	m_clLinksRestLength( queue, ctx, &m_linksRestLength, false ),
-	m_clLinksMaterialLinearStiffnessCoefficient( queue, &m_linksMaterialLinearStiffnessCoefficient, false )
+	m_clLinksMaterialLinearStiffnessCoefficient( queue, ctx, &m_linksMaterialLinearStiffnessCoefficient, false )
 {
 }
@@ -272,13 +304,13 @@ void btSoftBodyLinkDataOpenCL::generateBatches()
 	if( m_batchStartLengths.size() > 0 )
 	{
 		m_batchStartLengths.resize(batchCounts.size());
-		m_batchStartLengths[0] = std::pair< int, int >( 0, 0 );
+		m_batchStartLengths[0] = BatchPair(0, 0);
 		int sum = 0;
 		for( int batchIndex = 0; batchIndex < batchCounts.size(); ++batchIndex )
 		{
-			m_batchStartLengths[batchIndex].first = sum;
+			m_batchStartLengths[batchIndex].start = sum;
-			m_batchStartLengths[batchIndex].second = batchCounts[batchIndex];
+			m_batchStartLengths[batchIndex].length = batchCounts[batchIndex];
 			sum += batchCounts[batchIndex];
 		}
 	}
@@ -313,7 +345,7 @@ void btSoftBodyLinkDataOpenCL::generateBatches()
 		// next element in that batch, incrementing the batch counter
 		// afterwards
 		int batch = batchValues[linkIndex];
-		int newLocation = m_batchStartLengths[batch].first + batchCounts[batch];
+		int newLocation = m_batchStartLengths[batch].start + batchCounts[batch];
 		batchCounts[batch] = batchCounts[batch] + 1;
 		m_links[newLocation] = m_links_Backup[linkLocation];
@@ -336,11 +368,11 @@ void btSoftBodyLinkDataOpenCL::generateBatches()
-btSoftBodyTriangleDataOpenCL::btSoftBodyTriangleDataOpenCL( cl::CommandQueue queue ) : 
+btSoftBodyTriangleDataOpenCL::btSoftBodyTriangleDataOpenCL( cl_command_queue queue , cl_context ctx) : 
    m_queue( queue ),
-	m_clVertexIndices( queue, &m_vertexIndices, false ),
+	m_clVertexIndices( queue, ctx, &m_vertexIndices, false ),
-	m_clArea( queue, &m_area, false ),
+	m_clArea( queue, ctx, &m_area, false ),
-	m_clNormal( queue, &m_normal, false )
+	m_clNormal( queue, ctx, &m_normal, false )
 {
 }
@@ -493,7 +525,7 @@ void btSoftBodyTriangleDataOpenCL::generateBatches()
 	m_batchStartLengths.resize(batchCounts.size());
-	m_batchStartLengths[0] = std::pair< int, int >( 0, 0 );
+	m_batchStartLengths[0] = btSomePair(0,0);
 	int sum = 0;
@@ -547,18 +579,19 @@ void btSoftBodyTriangleDataOpenCL::generateBatches()
-btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(const cl::CommandQueue &queue) :
+btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_context ctx) :
-	m_linkData(queue),
+	m_linkData(queue, ctx),
-	m_vertexData(queue),
+	m_vertexData(queue, ctx),
-	m_triangleData(queue),
+	m_triangleData(queue, ctx),
-	m_clPerClothAcceleration(queue, &m_perClothAcceleration, true ),
+	m_clPerClothAcceleration(queue, ctx, &m_perClothAcceleration, true ),
-	m_clPerClothWindVelocity(queue, &m_perClothWindVelocity, true ),
+	m_clPerClothWindVelocity(queue, ctx, &m_perClothWindVelocity, true ),
-	m_clPerClothDampingFactor(queue, &m_perClothDampingFactor, true ),
+	m_clPerClothDampingFactor(queue,ctx, &m_perClothDampingFactor, true ),
-	m_clPerClothVelocityCorrectionCoefficient(queue, &m_perClothVelocityCorrectionCoefficient, true ),
+	m_clPerClothVelocityCorrectionCoefficient(queue, ctx,&m_perClothVelocityCorrectionCoefficient, true ),
-	m_clPerClothLiftFactor(queue, &m_perClothLiftFactor, true ),
+	m_clPerClothLiftFactor(queue, ctx,&m_perClothLiftFactor, true ),
-	m_clPerClothDragFactor(queue, &m_perClothDragFactor, true ),
+	m_clPerClothDragFactor(queue, ctx,&m_perClothDragFactor, true ),
-	m_clPerClothMediumDensity(queue, &m_perClothMediumDensity, true ),
+	m_clPerClothMediumDensity(queue, ctx,&m_perClothMediumDensity, true ),
-	m_queue( queue )
+	m_cqCommandQue( queue ),
 	m_cxMainContext(ctx)
 {
 	// Initially we will clearly need to update the solver constants
 	// For now this is global for the cloths linked with this solver - we should probably make this body specific 
@@ -590,7 +623,7 @@ void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &sof
 			using Vectormath::Aos::Point3;
 			// Create SoftBody that will store the information within the solver
-			btAcceleratedSoftBodyInterface *newSoftBody = new btAcceleratedSoftBodyInterface( softBody );
+			btOpenCLAcceleratedSoftBodyInterface *newSoftBody = new btOpenCLAcceleratedSoftBodyInterface( softBody );
 			m_softBodySet.push_back( newSoftBody );
 			m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );
@@ -712,51 +745,58 @@ bool btOpenCLSoftBodySolver::checkInitialized()
 void btOpenCLSoftBodySolver::resetNormalsAndAreas( int numVertices )
 {
-	resetNormalsAndAreasKernel.kernel.setArg(0, numVertices);
+	cl_int ciErrNum;
-	resetNormalsAndAreasKernel.kernel.setArg(1, m_vertexData.m_clVertexNormal.getBuffer());
+	ciErrNum = clSetKernelArg(resetNormalsAndAreasKernel, 0, sizeof(numVertices), (void*)&numVertices); //oclCHECKERROR(ciErrNum, CL_SUCCESS);
-	resetNormalsAndAreasKernel.kernel.setArg(2, m_vertexData.m_clVertexArea.getBuffer());
+	ciErrNum = clSetKernelArg(resetNormalsAndAreasKernel, 1, sizeof(cl_mem), (void*)&m_vertexData.m_clVertexNormal.m_buffer);//oclCHECKERROR(ciErrNum, CL_SUCCESS);
 	ciErrNum = clSetKernelArg(resetNormalsAndAreasKernel,  2, sizeof(cl_mem), (void*)&m_vertexData.m_clVertexArea.m_buffer); //oclCHECKERROR(ciErrNum, CL_SUCCESS);
 	size_t numWorkItems = workGroupSize*((numVertices + (workGroupSize-1)) / workGroupSize);
 	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, resetNormalsAndAreasKernel, 1, NULL, &numWorkItems, &workGroupSize, 0,0,0 );
-	int	numWorkItems = workGroupSize*((numVertices + (workGroupSize-1)) / workGroupSize);
+	if( ciErrNum != CL_SUCCESS )
 	cl_int err = m_queue.enqueueNDRangeKernel(resetNormalsAndAreasKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
 	if( err != CL_SUCCESS )
 	{
-		btAssert( "enqueueNDRangeKernel(resetNormalsAndAreasKernel)" );
+		btAssert( 0 && "enqueueNDRangeKernel(resetNormalsAndAreasKernel)" );
 	}
 }
 void btOpenCLSoftBodySolver::normalizeNormalsAndAreas( int numVertices )
 {
 	normalizeNormalsAndAreasKernel.kernel.setArg(0, numVertices);
 	normalizeNormalsAndAreasKernel.kernel.setArg(1, m_vertexData.m_clVertexTriangleCount.getBuffer());
 	normalizeNormalsAndAreasKernel.kernel.setArg(2, m_vertexData.m_clVertexNormal.getBuffer());
 	normalizeNormalsAndAreasKernel.kernel.setArg(3, m_vertexData.m_clVertexArea.getBuffer());
-	int	numWorkItems = workGroupSize*((numVertices + (workGroupSize-1)) / workGroupSize);
+	cl_int ciErrNum;
-	cl_int err = m_queue.enqueueNDRangeKernel(normalizeNormalsAndAreasKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
+
-	if( err != CL_SUCCESS ) 
+	ciErrNum = clSetKernelArg(normalizeNormalsAndAreasKernel, 0, sizeof(int),(void*) &numVertices);
 	ciErrNum = clSetKernelArg(normalizeNormalsAndAreasKernel, 1, sizeof(cl_mem), &m_vertexData.m_clVertexTriangleCount.m_buffer);
 	ciErrNum = clSetKernelArg(normalizeNormalsAndAreasKernel, 2, sizeof(cl_mem), &m_vertexData.m_clVertexNormal.m_buffer);
 	ciErrNum = clSetKernelArg(normalizeNormalsAndAreasKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexArea.m_buffer);
 	size_t	numWorkItems = workGroupSize*((numVertices + (workGroupSize-1)) / workGroupSize);
 	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, normalizeNormalsAndAreasKernel, 1, NULL, &numWorkItems, &workGroupSize, 0,0,0);
 	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert( "enqueueNDRangeKernel(normalizeNormalsAndAreasKernel)");
+		btAssert( 0 && "enqueueNDRangeKernel(normalizeNormalsAndAreasKernel)");
 	}
 }
 void btOpenCLSoftBodySolver::executeUpdateSoftBodies( int firstTriangle, int numTriangles )
 {
 	updateSoftBodiesKernel.kernel.setArg(0, firstTriangle);
 	updateSoftBodiesKernel.kernel.setArg(1, numTriangles);
 	updateSoftBodiesKernel.kernel.setArg(2, m_triangleData.m_clVertexIndices.getBuffer());
 	updateSoftBodiesKernel.kernel.setArg(3, m_vertexData.m_clVertexPosition.getBuffer());
 	updateSoftBodiesKernel.kernel.setArg(4, m_vertexData.m_clVertexNormal.getBuffer());
 	updateSoftBodiesKernel.kernel.setArg(5, m_vertexData.m_clVertexArea.getBuffer());
 	updateSoftBodiesKernel.kernel.setArg(6, m_triangleData.m_clNormal.getBuffer());
 	updateSoftBodiesKernel.kernel.setArg(7, m_triangleData.m_clArea.getBuffer());
 	cl_int ciErrNum;
 	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 0, sizeof(int), (void*) &firstTriangle);
 	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 1, sizeof(int), &numTriangles);
 	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 2, sizeof(cl_mem), &m_triangleData.m_clVertexIndices.m_buffer);
 	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
 	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 4, sizeof(cl_mem), &m_vertexData.m_clVertexNormal.m_buffer);
 	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexArea.m_buffer);
 	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 6, sizeof(cl_mem), &m_triangleData.m_clNormal.m_buffer);
 	ciErrNum = clSetKernelArg(updateSoftBodiesKernel, 7, sizeof(cl_mem), &m_triangleData.m_clArea.m_buffer);
-	int	numWorkItems = workGroupSize*((numTriangles + (workGroupSize-1)) / workGroupSize);
+	size_t numWorkItems = workGroupSize*((numTriangles + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(updateSoftBodiesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, updateSoftBodiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
-	if( err != CL_SUCCESS ) 
+	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(normalizeNormalsAndAreasKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(normalizeNormalsAndAreasKernel)");
 	}
 }
 void btOpenCLSoftBodySolver::updateSoftBodies()
@@ -807,6 +847,7 @@ void btOpenCLSoftBodySolver::ApplyClampedForce( float solverdt, const Vectormath
 void btOpenCLSoftBodySolver::applyForces( float solverdt )
 {	
 	// Ensure data is on accelerator
 	m_vertexData.moveToAccelerator();
 	m_clPerClothAcceleration.moveToGPU();
@@ -815,85 +856,30 @@ void btOpenCLSoftBodySolver::applyForces( float solverdt )
 	m_clPerClothMediumDensity.moveToGPU();
 	m_clPerClothWindVelocity.moveToGPU();			
-	cl_int err;
+	cl_int ciErrNum ;
-	err = applyForcesKernel.kernel.setArg(0, m_vertexData.getNumVertices());
+	int numVerts = m_vertexData.getNumVertices();
-	if( err != CL_SUCCESS ) 
+	ciErrNum = clSetKernelArg(applyForcesKernel, 0, sizeof(int), &numVerts);
 	ciErrNum = clSetKernelArg(applyForcesKernel, 1, sizeof(float), &solverdt);
 	float fl = FLT_EPSILON;
 	ciErrNum = clSetKernelArg(applyForcesKernel, 2, sizeof(float), &fl);
 	ciErrNum = clSetKernelArg(applyForcesKernel, 3, sizeof(cl_mem), &m_vertexData.m_clClothIdentifier.m_buffer);
 	ciErrNum = clSetKernelArg(applyForcesKernel, 4, sizeof(cl_mem), &m_vertexData.m_clVertexNormal.m_buffer);
 	ciErrNum = clSetKernelArg(applyForcesKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexArea.m_buffer);
 	ciErrNum = clSetKernelArg(applyForcesKernel, 6, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
 	ciErrNum = clSetKernelArg(applyForcesKernel, 7, sizeof(cl_mem), &m_clPerClothLiftFactor.m_buffer);
 	ciErrNum = clSetKernelArg(applyForcesKernel, 8 ,sizeof(cl_mem), &m_clPerClothDragFactor.m_buffer);
 	ciErrNum = clSetKernelArg(applyForcesKernel, 9, sizeof(cl_mem), &m_clPerClothWindVelocity.m_buffer);
 	ciErrNum = clSetKernelArg(applyForcesKernel,10, sizeof(cl_mem), &m_clPerClothAcceleration.m_buffer);
 	ciErrNum = clSetKernelArg(applyForcesKernel,11, sizeof(cl_mem), &m_clPerClothMediumDensity.m_buffer);
 	ciErrNum = clSetKernelArg(applyForcesKernel,12, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
 	ciErrNum = clSetKernelArg(applyForcesKernel,13, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
 	size_t numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
 	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,applyForcesKernel, 1, NULL, &numWorkItems, &workGroupSize, 0,0,0);
 	if( ciErrNum != CL_SUCCESS ) 
 	{
-		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	err = applyForcesKernel.kernel.setArg(1, solverdt);
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	err = applyForcesKernel.kernel.setArg(2, FLT_EPSILON);
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	err = applyForcesKernel.kernel.setArg(3, m_vertexData.m_clClothIdentifier.getBuffer());
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	err = applyForcesKernel.kernel.setArg(4, m_vertexData.m_clVertexNormal.getBuffer());
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	err = applyForcesKernel.kernel.setArg(5, m_vertexData.m_clVertexArea.getBuffer());
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	err = applyForcesKernel.kernel.setArg(6, m_vertexData.m_clVertexInverseMass.getBuffer());
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	err = applyForcesKernel.kernel.setArg(7, m_clPerClothLiftFactor.getBuffer());
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	err = applyForcesKernel.kernel.setArg(8, m_clPerClothDragFactor.getBuffer());
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	err = applyForcesKernel.kernel.setArg(9, m_clPerClothWindVelocity.getBuffer());
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	err = applyForcesKernel.kernel.setArg(10, m_clPerClothAcceleration.getBuffer());
 	if( err != CL_SUCCESS )
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	err = applyForcesKernel.kernel.setArg(11, m_clPerClothMediumDensity.getBuffer());
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	err = applyForcesKernel.kernel.setArg(12, m_vertexData.m_clVertexForceAccumulator.getBuffer());
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	err = applyForcesKernel.kernel.setArg(13, m_vertexData.m_clVertexVelocity.getBuffer());
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
 	err = m_queue.enqueueNDRangeKernel(applyForcesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert(  "enqueueNDRangeKernel(applyForcesKernel)");
 	}
 }
 /**
@@ -901,22 +887,26 @@ void btOpenCLSoftBodySolver::applyForces( float solverdt )
 */
 void btOpenCLSoftBodySolver::integrate( float solverdt )
 {
 	// Ensure data is on accelerator
 	m_vertexData.moveToAccelerator();
-	integrateKernel.kernel.setArg(0, m_vertexData.getNumVertices());
+	cl_int ciErrNum;
-	integrateKernel.kernel.setArg(1, solverdt);
+	int numVerts = m_vertexData.getNumVertices();
-	integrateKernel.kernel.setArg(2, m_vertexData.m_clVertexInverseMass.getBuffer());
+	ciErrNum = clSetKernelArg(integrateKernel, 0, sizeof(int), &numVerts);
-	integrateKernel.kernel.setArg(3, m_vertexData.m_clVertexPosition.getBuffer());
+	ciErrNum = clSetKernelArg(integrateKernel, 1, sizeof(float), &solverdt);
-	integrateKernel.kernel.setArg(4, m_vertexData.m_clVertexVelocity.getBuffer());
+	ciErrNum = clSetKernelArg(integrateKernel, 2, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
-	integrateKernel.kernel.setArg(5, m_vertexData.m_clVertexPreviousPosition.getBuffer());
+	ciErrNum = clSetKernelArg(integrateKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
-	integrateKernel.kernel.setArg(6, m_vertexData.m_clVertexForceAccumulator.getBuffer());
+	ciErrNum = clSetKernelArg(integrateKernel, 4, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
 	ciErrNum = clSetKernelArg(integrateKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
 	ciErrNum = clSetKernelArg(integrateKernel, 6, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	size_t numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
-	cl_int err = m_queue.enqueueNDRangeKernel(integrateKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,integrateKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
-	if( err != CL_SUCCESS )
+	if( ciErrNum != CL_SUCCESS )
 	{
-		btAssert(  "enqueueNDRangeKernel(integrateKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(integrateKernel)");
 	}
 }
@@ -935,6 +925,7 @@ float btOpenCLSoftBodySolver::computeTriangleArea(
 void btOpenCLSoftBodySolver::updateConstants( float timeStep )
 {			
 	using namespace Vectormath::Aos;
 	if( m_updateSolverConstants )
@@ -959,10 +950,12 @@ void btOpenCLSoftBodySolver::updateConstants( float timeStep )
 			m_linkData.getRestLengthSquared(linkIndex) = restLengthSquared;
 		}
 	}
 }
 void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 {
 	using Vectormath::Aos::Vector3;
 	using Vectormath::Aos::Point3;
 	using Vectormath::Aos::lengthSqr;
@@ -988,33 +981,34 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
-	// Prepare anchors
+	for( int iteration = 0; iteration < m_numberOfVelocityIterations ; ++iteration )
 	/*for(i=0,ni=m_anchors.size();i<ni;++i)
 	{
-		Anchor&			a=m_anchors[i];
+		for( int i = 0; i < m_linkData.m_batchStartLengths.size(); ++i )
-		const btVector3	ra=a.m_body->getWorldTransform().getBasis()*a.m_local;
+		{
-		a.m_c0	=	ImpulseMatrix(	m_sst.sdt,
+			int startLink = m_linkData.m_batchStartLengths[i].start;
-			a.m_node->m_im,
+			int numLinks = m_linkData.m_batchStartLengths[i].length;
 			a.m_body->getInvMass(),
 			a.m_body->getInvInertiaTensorWorld(),
 			ra);
 		a.m_c1	=	ra;
 		a.m_c2	=	m_sst.sdt*a.m_node->m_im;
 		a.m_body->activate();
 	}*/
-	// Really want to combine these into a single loop, don't we? No update in the middle?
+			solveLinksForVelocity( startLink, numLinks, kst );
-
+		}
-	// TODO: Double check what kst is meant to mean - passed in as 1 in the bullet code
+	}
 	// Compute new positions from velocity
 	// Also update the previous position so that our position computation is now based on the new position from the velocity solution
 	// rather than based directly on the original positions
 	if( m_numberOfVelocityIterations > 0 )
 	{
 		updateVelocitiesFromPositionsWithVelocities( 1.f/solverdt );
 	} else {
 		updateVelocitiesFromPositionsWithoutVelocities( 1.f/solverdt );
 	}
 	// Solve drift
 	for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
 	{
 		for( int i = 0; i < m_linkData.m_batchStartLengths.size(); ++i )
 		{
-			int startLink = m_linkData.m_batchStartLengths[i].first;
+			int startLink = m_linkData.m_batchStartLengths[i].start;
-			int numLinks = m_linkData.m_batchStartLengths[i].second;
+			int numLinks = m_linkData.m_batchStartLengths[i].length;
 			solveLinksForPosition( startLink, numLinks, kst, ti );
 		}
@@ -1023,6 +1017,7 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 	updateVelocitiesFromPositionsWithoutVelocities( 1.f/solverdt );
 }
@@ -1030,96 +1025,136 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 // Kernel dispatches
 // Dispatches prepareLinksKernel over every link held in m_linkData.
 // This is a diff hunk: the unmarked .kernel.setArg(...) lines and the '-' lines use
 // the cl::Kernel / m_queue wrapper calls, while the '+' lines and the unmarked
 // clSetKernelArg / clEnqueueNDRangeKernel lines bind the same six arguments through
 // the plain C API and enqueue on m_cqCommandQue.
 // Kernel arguments: 0 = link count, 1 = m_clLinks, 2 = m_clLinksMassLSC,
 // 3 = m_clVertexPreviousPosition, 4 = m_clLinksLengthRatio, 5 = m_clLinksCLength.
 // NOTE(review): every clSetKernelArg result is overwritten by the next assignment to
 // ciErrNum, so only the enqueue result is actually tested.
 void btOpenCLSoftBodySolver::prepareLinks()
 {
 	prepareLinksKernel.kernel.setArg(0, m_linkData.getNumLinks());
 	prepareLinksKernel.kernel.setArg(1, m_linkData.m_clLinks.getBuffer());
 	prepareLinksKernel.kernel.setArg(2, m_linkData.m_clLinksMassLSC.getBuffer());
 	prepareLinksKernel.kernel.setArg(3, m_vertexData.m_clVertexPreviousPosition.getBuffer());
 	prepareLinksKernel.kernel.setArg(4, m_linkData.m_clLinksLengthRatio.getBuffer());
 	prepareLinksKernel.kernel.setArg(5, m_linkData.m_clLinksCLength.getBuffer());
-	int	numWorkItems = workGroupSize*((m_linkData.getNumLinks() + (workGroupSize-1)) / workGroupSize);
+	cl_int ciErrNum;
-	cl_int err = m_queue.enqueueNDRangeKernel(prepareLinksKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
+	int numLinks = m_linkData.getNumLinks();
-	if( err != CL_SUCCESS ) 
+	ciErrNum = clSetKernelArg(prepareLinksKernel,0, sizeof(int), &numLinks);
 	ciErrNum = clSetKernelArg(prepareLinksKernel,1, sizeof(cl_mem), &m_linkData.m_clLinks.m_buffer);
 	ciErrNum = clSetKernelArg(prepareLinksKernel,2, sizeof(cl_mem), &m_linkData.m_clLinksMassLSC.m_buffer);
 	ciErrNum = clSetKernelArg(prepareLinksKernel,3, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
 	ciErrNum = clSetKernelArg(prepareLinksKernel,4, sizeof(cl_mem), &m_linkData.m_clLinksLengthRatio.m_buffer);
 	ciErrNum = clSetKernelArg(prepareLinksKernel,5, sizeof(cl_mem), &m_linkData.m_clLinksCLength.m_buffer);
 	// Global size is the link count rounded up to a whole number of work groups.
 	size_t	numWorkItems = workGroupSize*((m_linkData.getNumLinks() + (workGroupSize-1)) / workGroupSize);
 	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,prepareLinksKernel, 1 , NULL, &numWorkItems, &workGroupSize,0,0,0);
 	if( ciErrNum != CL_SUCCESS ) 
 	{
 		// A bare string literal is always true, so the '-' form could never fail;
 		// the '+' form prefixes it with 0 so the condition is false.
-		btAssert(  "enqueueNDRangeKernel(prepareLinksKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(prepareLinksKernel)");
 	}
 }
 // Dispatches updatePositionsFromVelocitiesKernel over every vertex in m_vertexData.
 // Diff hunk: the unmarked .kernel.setArg(...) lines and the '-' lines use the
 // cl::Kernel / m_queue wrapper calls; the '+' lines and the unmarked clSetKernelArg /
 // clEnqueueNDRangeKernel lines bind the same five arguments through the plain C API.
 // Kernel arguments: 0 = vertex count, 1 = solverdt, 2 = m_clVertexVelocity,
 // 3 = m_clVertexPreviousPosition, 4 = m_clVertexPosition.
 // NOTE(review): the clSetKernelArg results are overwritten before being inspected;
 // only the enqueue result is tested.
 void btOpenCLSoftBodySolver::updatePositionsFromVelocities( float solverdt )
 {
 	updatePositionsFromVelocitiesKernel.kernel.setArg(0, m_vertexData.getNumVertices());
 	updatePositionsFromVelocitiesKernel.kernel.setArg(1, solverdt);
 	updatePositionsFromVelocitiesKernel.kernel.setArg(2, m_vertexData.m_clVertexVelocity.getBuffer());
 	updatePositionsFromVelocitiesKernel.kernel.setArg(3, m_vertexData.m_clVertexPreviousPosition.getBuffer());
 	updatePositionsFromVelocitiesKernel.kernel.setArg(4, m_vertexData.m_clVertexPosition.getBuffer());
-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	cl_int ciErrNum;
-	cl_int err = m_queue.enqueueNDRangeKernel(updatePositionsFromVelocitiesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
+	int numVerts = m_vertexData.getNumVertices();
-	if( err != CL_SUCCESS ) 
+	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,0, sizeof(int), &numVerts);
 	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,1, sizeof(float), &solverdt);
 	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,2, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
 	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,3, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
 	ciErrNum = clSetKernelArg(updatePositionsFromVelocitiesKernel,4, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
 	// Global size is the vertex count rounded up to a whole number of work groups.
 	size_t	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
 	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,updatePositionsFromVelocitiesKernel, 1, NULL, &numWorkItems,&workGroupSize,0,0,0);
 	if( ciErrNum != CL_SUCCESS ) 
 	{
 		// The '-' form asserts on a string literal alone (always true); the '+' form adds "0 &&".
-		btAssert(  "enqueueNDRangeKernel(updatePositionsFromVelocitiesKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(updatePositionsFromVelocitiesKernel)");
 	}
 }
 // Dispatches solvePositionsFromLinksKernel for one batch of links.
 // Diff hunk: the unmarked .kernel.setArg(...) lines and the '-' lines use the
 // cl::Kernel / m_queue wrapper calls; the '+' lines and the unmarked clSetKernelArg /
 // clEnqueueNDRangeKernel lines bind the same nine arguments through the plain C API.
 // Kernel arguments: 0 = startLink, 1 = numLinks, 2 = kst, 3 = ti, 4 = m_clLinks,
 // 5 = m_clLinksMassLSC, 6 = m_clLinksRestLengthSquared, 7 = m_clVertexInverseMass,
 // 8 = m_clVertexPosition.
 // NOTE(review): the clSetKernelArg results are overwritten before being inspected;
 // only the enqueue result is tested.
 void btOpenCLSoftBodySolver::solveLinksForPosition( int startLink, int numLinks, float kst, float ti )
 {
 	solvePositionsFromLinksKernel.kernel.setArg(0, startLink);
 	solvePositionsFromLinksKernel.kernel.setArg(1, numLinks);
 	solvePositionsFromLinksKernel.kernel.setArg(2, kst);
 	solvePositionsFromLinksKernel.kernel.setArg(3, ti);
 	solvePositionsFromLinksKernel.kernel.setArg(4, m_linkData.m_clLinks.getBuffer());
 	solvePositionsFromLinksKernel.kernel.setArg(5, m_linkData.m_clLinksMassLSC.getBuffer());
 	solvePositionsFromLinksKernel.kernel.setArg(6, m_linkData.m_clLinksRestLengthSquared.getBuffer());
 	solvePositionsFromLinksKernel.kernel.setArg(7, m_vertexData.m_clVertexInverseMass.getBuffer());
 	solvePositionsFromLinksKernel.kernel.setArg(8, m_vertexData.m_clVertexPosition.getBuffer());
-	int	numWorkItems = workGroupSize*((numLinks + (workGroupSize-1)) / workGroupSize);
+	cl_int ciErrNum;
-	cl_int err = m_queue.enqueueNDRangeKernel(solvePositionsFromLinksKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,0, sizeof(int), &startLink);
-	if( err != CL_SUCCESS ) 
+	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,1, sizeof(int), &numLinks);
 	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,2, sizeof(float), &kst);
 	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,3, sizeof(float), &ti);
 	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,4, sizeof(cl_mem), &m_linkData.m_clLinks.m_buffer);
 	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,5, sizeof(cl_mem), &m_linkData.m_clLinksMassLSC.m_buffer);
 	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,6, sizeof(cl_mem), &m_linkData.m_clLinksRestLengthSquared.m_buffer);
 	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,7, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
 	ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,8, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
 	// Global size is the batch's link count rounded up to a whole number of work groups.
 	size_t	numWorkItems = workGroupSize*((numLinks + (workGroupSize-1)) / workGroupSize);
 	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,solvePositionsFromLinksKernel,1,NULL,&numWorkItems,&workGroupSize,0,0,0);
 	if( ciErrNum!= CL_SUCCESS ) 
 	{
 		// The '-' form asserts on a string literal alone (always true); the '+' form adds "0 &&".
-		btAssert(  "enqueueNDRangeKernel(solvePositionsFromLinksKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(solvePositionsFromLinksKernel)");
 	}
 } // solveLinksForPosition
 /**
  * Enqueue vSolveLinksKernel for one batch of links.
  *
  * Kernel arguments: 0 = startLink, 1 = numLinks, 2 = m_clLinks,
  * 3 = m_clLinksLengthRatio, 4 = m_clLinksCLength, 5 = m_clVertexInverseMass,
  * 6 = m_clVertexVelocity.
  *
  * @param startLink  Index of the first link in the batch.
  * @param numLinks   Number of links in the batch; also sizes the dispatch.
  * @param kst        Not referenced by this function and not forwarded to the kernel.
  */
 void btOpenCLSoftBodySolver::solveLinksForVelocity( int startLink, int numLinks, float kst )
 {
 	// An empty batch has nothing to solve, and a zero global work size is
 	// rejected by clEnqueueNDRangeKernel in OpenCL 1.x, so skip the dispatch.
 	if( numLinks <= 0 )
 	{
 		return;
 	}
 	// CL_SUCCESS is 0, so OR-ing the results leaves ciErrNum non-zero if any
 	// single argument failed to bind.
 	cl_int ciErrNum = CL_SUCCESS;
 	ciErrNum |= clSetKernelArg(vSolveLinksKernel, 0, sizeof(int), &startLink);
 	ciErrNum |= clSetKernelArg(vSolveLinksKernel, 1, sizeof(int), &numLinks);
 	ciErrNum |= clSetKernelArg(vSolveLinksKernel, 2, sizeof(cl_mem), &m_linkData.m_clLinks.m_buffer);
 	ciErrNum |= clSetKernelArg(vSolveLinksKernel, 3, sizeof(cl_mem), &m_linkData.m_clLinksLengthRatio.m_buffer);
 	ciErrNum |= clSetKernelArg(vSolveLinksKernel, 4, sizeof(cl_mem), &m_linkData.m_clLinksCLength.m_buffer);
 	ciErrNum |= clSetKernelArg(vSolveLinksKernel, 5, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
 	ciErrNum |= clSetKernelArg(vSolveLinksKernel, 6, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
 	if( ciErrNum != CL_SUCCESS )
 	{
 		// Do not launch with arguments that were not bound as intended.
 		btAssert( 0 &&  "clSetKernelArg(vSolveLinksKernel)");
 		return;
 	}
 	// Global size is the batch's link count rounded up to a whole number of work groups.
 	size_t	numWorkItems = workGroupSize*((numLinks + (workGroupSize-1)) / workGroupSize);
 	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,vSolveLinksKernel,1,NULL,&numWorkItems, &workGroupSize,0,0,0);
 	if( ciErrNum != CL_SUCCESS )
 	{
 		btAssert( 0 &&  "enqueueNDRangeKernel(vSolveLinksKernel)");
 	}
 }
 // Dispatches updateVelocitiesFromPositionsWithVelocitiesKernel over every vertex.
 // Diff hunk: the unmarked .kernel.setArg(...) lines and the '-' lines use the
 // cl::Kernel / m_queue wrapper calls; the '+' lines and the unmarked clSetKernelArg /
 // clEnqueueNDRangeKernel lines bind the same nine arguments through the plain C API.
 // Kernel arguments: 0 = vertex count, 1 = isolverdt, 2 = m_clVertexPosition,
 // 3 = m_clVertexPreviousPosition, 4 = m_clClothIdentifier,
 // 5 = m_clPerClothVelocityCorrectionCoefficient, 6 = m_clPerClothDampingFactor,
 // 7 = m_clVertexVelocity, 8 = m_clVertexForceAccumulator.
 // The caller visible in this file passes 1.f/solverdt as isolverdt.
 // NOTE(review): the clSetKernelArg results are overwritten before being inspected;
 // only the enqueue result is tested.
 void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithVelocities( float isolverdt )
 {
 	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(0, m_vertexData.getNumVertices());
 	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(1, isolverdt);
 	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(2, m_vertexData.m_clVertexPosition.getBuffer());
 	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(3, m_vertexData.m_clVertexPreviousPosition.getBuffer());
 	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(4, m_vertexData.m_clClothIdentifier.getBuffer());
 	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(5, m_clPerClothVelocityCorrectionCoefficient.getBuffer());
 	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(6, m_clPerClothDampingFactor.getBuffer());
 	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(7, m_vertexData.m_clVertexVelocity.getBuffer());
 	updateVelocitiesFromPositionsWithVelocitiesKernel.kernel.setArg(8, m_vertexData.m_clVertexForceAccumulator.getBuffer());
-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	cl_int ciErrNum;
-	cl_int err = m_queue.enqueueNDRangeKernel(updateVelocitiesFromPositionsWithVelocitiesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
+	int numVerts = m_vertexData.getNumVertices();
-	if( err != CL_SUCCESS ) 
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel,0, sizeof(int), &numVerts);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 1, sizeof(float), &isolverdt);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 2, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexPreviousPosition.m_buffer);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 4, sizeof(cl_mem), &m_vertexData.m_clClothIdentifier.m_buffer);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 5, sizeof(cl_mem), &m_clPerClothVelocityCorrectionCoefficient.m_buffer);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 6, sizeof(cl_mem), &m_clPerClothDampingFactor.m_buffer);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 7, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithVelocitiesKernel, 8, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
 	// Global size is the vertex count rounded up to a whole number of work groups.
 	size_t	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
 	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,updateVelocitiesFromPositionsWithVelocitiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
 	if( ciErrNum != CL_SUCCESS ) 
 	{
 		// The '-' form asserts on a string literal alone (always true); the '+' form adds "0 &&".
-		btAssert(  "enqueueNDRangeKernel(updateVelocitiesFromPositionsWithVelocitiesKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(updateVelocitiesFromPositionsWithVelocitiesKernel)");
 	}
 } // updateVelocitiesFromPositionsWithVelocities
 // Dispatches updateVelocitiesFromPositionsWithoutVelocitiesKernel over every vertex.
 // Diff hunk: the unmarked .kernel.setArg(...) lines and the '-' lines use the
 // cl::Kernel / m_queue wrapper calls; the '+' lines and the unmarked clSetKernelArg /
 // clEnqueueNDRangeKernel lines bind the same eight arguments through the plain C API.
 // Kernel arguments: 0 = vertex count, 1 = isolverdt, 2 = m_clVertexPosition,
 // 3 = m_clVertexPreviousPosition, 4 = m_clClothIdentifier,
 // 5 = m_clPerClothDampingFactor, 6 = m_clVertexVelocity,
 // 7 = m_clVertexForceAccumulator.
 // The callers visible in this file pass 1.f/solverdt as isolverdt.
 // NOTE(review): the clSetKernelArg results are overwritten before being inspected;
 // only the enqueue result is tested.
 void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( float isolverdt )
 {
 	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(0, m_vertexData.getNumVertices());
 	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(1, isolverdt);
 	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(2, m_vertexData.m_clVertexPosition.getBuffer());
 	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(3, m_vertexData.m_clVertexPreviousPosition.getBuffer());
 	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(4, m_vertexData.m_clClothIdentifier.getBuffer());
 	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(5, m_clPerClothDampingFactor.getBuffer());
 	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(6, m_vertexData.m_clVertexVelocity.getBuffer());
 	updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel.setArg(7, m_vertexData.m_clVertexForceAccumulator.getBuffer());
-	int	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
+	cl_int ciErrNum;
-	cl_int err = m_queue.enqueueNDRangeKernel(updateVelocitiesFromPositionsWithoutVelocitiesKernel.kernel, cl::NullRange, cl::NDRange(numWorkItems), cl::NDRange(workGroupSize));
+	int numVerts = m_vertexData.getNumVertices();
-	if( err != CL_SUCCESS ) 
+	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 0, sizeof(int), &numVerts);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 1, sizeof(float), &isolverdt);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 2, sizeof(cl_mem),&m_vertexData.m_clVertexPosition.m_buffer);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 3, sizeof(cl_mem),&m_vertexData.m_clVertexPreviousPosition.m_buffer);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 4, sizeof(cl_mem),&m_vertexData.m_clClothIdentifier.m_buffer);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 5, sizeof(cl_mem),&m_clPerClothDampingFactor.m_buffer);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 6, sizeof(cl_mem),&m_vertexData.m_clVertexVelocity.m_buffer);
 	ciErrNum = clSetKernelArg(updateVelocitiesFromPositionsWithoutVelocitiesKernel, 7, sizeof(cl_mem),&m_vertexData.m_clVertexForceAccumulator.m_buffer);
 	// Global size is the vertex count rounded up to a whole number of work groups.
 	size_t	numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
 	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,updateVelocitiesFromPositionsWithoutVelocitiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
 	if( ciErrNum != CL_SUCCESS ) 
 	{
 		// The '-' form asserts on a string literal alone (always true); the '+' form adds "0 &&".
-		btAssert(  "enqueueNDRangeKernel(updateVelocitiesFromPositionsWithoutVelocitiesKernel)");
+		btAssert( 0 &&  "enqueueNDRangeKernel(updateVelocitiesFromPositionsWithoutVelocitiesKernel)");
 	}
 } // updateVelocitiesFromPositionsWithoutVelocities
 // End kernel dispatches
@@ -1133,15 +1168,20 @@ void btOpenCLSoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * cons
 	// and use them together on a single kernel call if possible by setting up a
 	// per-cloth target buffer array for the copy kernel.
-	btAcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );
+
 	btOpenCLAcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );
 	const int firstVertex = currentCloth->getFirstVertex();
 	const int lastVertex = firstVertex + currentCloth->getNumVertices();
 	if( vertexBuffer->getBufferType() == btVertexBufferDescriptor::CPU_BUFFER )
 	{		
 		const int firstVertex = currentCloth->getFirstVertex();
 		const int lastVertex = firstVertex + currentCloth->getNumVertices();
 		const btCPUVertexBufferDescriptor *cpuVertexBuffer = static_cast< btCPUVertexBufferDescriptor* >(vertexBuffer);						
 		float *basePointer = cpuVertexBuffer->getBasePointer();						
 		m_vertexData.m_clVertexPosition.copyFromGPU();
 		m_vertexData.m_clVertexNormal.copyFromGPU();
 		if( vertexBuffer->hasVertexPositions() )
 		{
 			const int vertexOffset = cpuVertexBuffer->getVertexOffset();
@@ -1173,43 +1213,46 @@ void btOpenCLSoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * cons
 			}
 		}
 	}
 } // btOpenCLSoftBodySolver::copySoftBodyToVertexBuffer
 // Builds an OpenCL program from a source string and returns one kernel from it.
 // Diff hunk: the '-' lines belong to the cl::Program / cl::Kernel version that
 // returned a KernelDesc; the '+' lines and the unmarked C-API lines create the
 // program with clCreateProgramWithSource on m_cxMainContext, build it with
 // clBuildProgram using `flags`, create the kernel called kernelName with
 // clCreateKernel, and return the resulting cl_kernel.
 // If the build or the kernel creation fails, the function prints a message, calls
 // btAssert(0) and then exit(0).
 // NOTE(review): the result of clCreateProgramWithSource is not checked (the
 // oclCHECKERROR line is commented out).
 // NOTE(review): m_cpProgram is a local variable despite the m_ prefix, and no
 // clReleaseProgram call is visible in this function.
 // NOTE(review): exit(0) is used on the failure paths; status 0 conventionally
 // reports success to the calling process.
 // NOTE(review): the printf literal spells "kernalName".
 // NOTE(review): the unmarked lines after "+	return kernel;" reference err, program
 // and devices, which are only declared on '-' lines - presumably leftovers of the
 // removed implementation; confirm against the real file.
-btOpenCLSoftBodySolver::KernelDesc btOpenCLSoftBodySolver::compileCLKernelFromString( const char *shaderString, const char *shaderName )
+cl_kernel btOpenCLSoftBodySolver::compileCLKernelFromString( const char* kernelSource, const char* kernelName )
 {
-	cl_int err;
+	printf("compiling kernalName: %s ",kernelName);
 	cl_kernel kernel;
 	cl_int ciErrNum;
 	size_t program_length = strlen(kernelSource);
-	context = m_queue.getInfo<CL_QUEUE_CONTEXT>();
+	cl_program m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&kernelSource, &program_length, &ciErrNum);
-	device = m_queue.getInfo<CL_QUEUE_DEVICE>();
+//	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-	std::vector< cl::Device > devices;
+		
-	devices.push_back( device );
+    // Build the program with 'mad' Optimization option
 #ifdef MAC
 	char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
 #else
 	const char* flags = "-DGUID_ARG=";
 #endif
    ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, flags, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
 		btAssert(0);
        exit(0);
    }
    // Create the kernel
    kernel = clCreateKernel(m_cpProgram, kernelName, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
 		btAssert(0);
 		exit(0);
    }
-	cl::Program::Sources source(1, std::make_pair(shaderString, strlen(shaderString) + 1));
+	printf("ready. \n");
-	cl::Program program(context, source, &err);
+	return kernel;
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert( "program" );
 	}
 	err = program.build(devices);
 	if (err != CL_SUCCESS) {
 		//std::string str;
 		//str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
 		//std::cout << "Program Info: " << str;
 		if( err != CL_SUCCESS ) 
 		{
 			btAssert( "Program::build()" );
 		}
 	}
 	cl::Kernel kernel(program, shaderName, &err);
 	if( err != CL_SUCCESS ) 
 	{
 		btAssert( "kernel" );
 	}
 	KernelDesc descriptor;
 	descriptor.kernel = kernel;
 	return descriptor;
 }
 void btOpenCLSoftBodySolver::predictMotion( float timeStep )
@@ -1234,11 +1277,11 @@ void btOpenCLSoftBodySolver::predictMotion( float timeStep )
-btOpenCLSoftBodySolver::btAcceleratedSoftBodyInterface *btOpenCLSoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
+btOpenCLAcceleratedSoftBodyInterface *btOpenCLSoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
 {
 	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
 	{
-		btAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
+		btOpenCLAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
 		if( softBodyInterface->getSoftBody() == softBody )
 			return softBodyInterface;
 	}
@@ -1273,4 +1316,4 @@ bool btOpenCLSoftBodySolver::buildShaders()
 		m_shadersInitialized = true;
 	return returnVal;
-}
+}
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
@@ -16,204 +16,165 @@ subject to the following restrictions:
 #ifndef BT_SOFT_BODY_SOLVER_OPENCL_H
 #define BT_SOFT_BODY_SOLVER_OPENCL_H
 #include "stddef.h" //for size_t
 #include "vectormath/vmInclude.h"
 #include "BulletSoftBody/btSoftBodySolvers.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h"
+#include "btSoftBodySolverBuffer_OpenCL.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolverLinkData_OpenCL.h"
+#include "btSoftBodySolverLinkData_OpenCL.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolverVertexData_OpenCL.h"
+#include "btSoftBodySolverVertexData_OpenCL.h"
-#include "BulletSoftBody/solvers/OpenCL/btSoftBodySolverTriangleData_OpenCL.h"
+#include "btSoftBodySolverTriangleData_OpenCL.h"
 /**
  * Per-cloth bookkeeping record kept by a solver for one soft body.
  *
  * Holds the first index plus the current and maximum counts of the vertices,
  * triangles and links assigned to the cloth, and a pointer to the btSoftBody
  * they describe. This data addresses the main solver arrays.
  *
  * The class only stores and returns these values: no setter validates its
  * argument or touches any array.
  */
 class btOpenCLAcceleratedSoftBodyInterface
 {
 protected:
 	/** Current number of vertices that are part of this cloth */
 	int m_numVertices;
 	/** Maximum number of vertices allocated to be part of this cloth */
 	int m_maxVertices;
 	/** Current number of triangles that are part of this cloth */
 	int m_numTriangles;
 	/** Maximum number of triangles allocated to be part of this cloth */
 	int m_maxTriangles;
 	/** Index of first vertex in the world allocated to this cloth */
 	int m_firstVertex;
 	/** Index of first triangle in the world allocated to this cloth */
 	int m_firstTriangle;
 	/** Index of first link in the world allocated to this cloth */
 	int m_firstLink;
 	/** Maximum number of links allocated to this cloth */
 	int m_maxLinks;
 	/** Current number of links allocated to this cloth */
 	int m_numLinks;
 	/** The actual soft body this data represents */
 	btSoftBody *m_softBody;
 public:
 	/**
 	 * Create a record for softBody with every count and offset set to zero.
 	 * The pointer is stored as given; it is not dereferenced here.
 	 */
 	btOpenCLAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
 		m_numVertices( 0 ),
 		m_maxVertices( 0 ),
 		m_numTriangles( 0 ),
 		m_maxTriangles( 0 ),
 		m_firstVertex( 0 ),
 		m_firstTriangle( 0 ),
 		m_firstLink( 0 ),
 		m_maxLinks( 0 ),
 		m_numLinks( 0 ),
 		m_softBody( softBody )
 	{
 	}
 	// Read accessors. They are const so they can be called through a
 	// const pointer or reference to the record.
 	int getNumVertices() const
 	{
 		return m_numVertices;
 	}
 	int getNumTriangles() const
 	{
 		return m_numTriangles;
 	}
 	int getMaxVertices() const
 	{
 		return m_maxVertices;
 	}
 	int getMaxTriangles() const
 	{
 		return m_maxTriangles;
 	}
 	int getFirstVertex() const
 	{
 		return m_firstVertex;
 	}
 	int getFirstTriangle() const
 	{
 		return m_firstTriangle;
 	}
 	int getMaxLinks() const
 	{
 		return m_maxLinks;
 	}
 	int getNumLinks() const
 	{
 		return m_numLinks;
 	}
 	int getFirstLink() const
 	{
 		return m_firstLink;
 	}
 	/** Returns the soft body pointer given to the constructor. */
 	btSoftBody* getSoftBody() const
 	{
 		return m_softBody;
 	}
 	// TODO: All of these set functions will have to do checks and
 	// update the world because restructuring of the arrays will be necessary
 	// Reasonable use of "friend"?
 	void setNumVertices( int numVertices )
 	{
 		m_numVertices = numVertices;
 	}
 	void setNumTriangles( int numTriangles )
 	{
 		m_numTriangles = numTriangles;
 	}
 	void setMaxVertices( int maxVertices )
 	{
 		m_maxVertices = maxVertices;
 	}
 	void setMaxTriangles( int maxTriangles )
 	{
 		m_maxTriangles = maxTriangles;
 	}
 	void setFirstVertex( int firstVertex )
 	{
 		m_firstVertex = firstVertex;
 	}
 	void setFirstTriangle( int firstTriangle )
 	{
 		m_firstTriangle = firstTriangle;
 	}
 	void setMaxLinks( int maxLinks )
 	{
 		m_maxLinks = maxLinks;
 	}
 	void setNumLinks( int numLinks )
 	{
 		m_numLinks = numLinks;
 	}
 	void setFirstLink( int firstLink )
 	{
 		m_firstLink = firstLink;
 	}
 };
 class btOpenCLSoftBodySolver : public btSoftBodySolver
 {
 private:
 	/**
 	 * Per-cloth bookkeeping record kept by the solver for one soft body.
 	 *
 	 * Holds the first index plus the current and maximum counts of the
 	 * vertices, triangles and links assigned to the cloth, and a pointer to
 	 * the btSoftBody they describe. This data addresses the main solver arrays.
 	 *
 	 * The class only stores and returns these values: no setter validates its
 	 * argument or touches any array.
 	 */
 	class btAcceleratedSoftBodyInterface
 	{
 	protected:
 		/** Current number of vertices that are part of this cloth */
 		int m_numVertices;
 		/** Maximum number of vertices allocated to be part of this cloth */
 		int m_maxVertices;
 		/** Current number of triangles that are part of this cloth */
 		int m_numTriangles;
 		/** Maximum number of triangles allocated to be part of this cloth */
 		int m_maxTriangles;
 		/** Index of first vertex in the world allocated to this cloth */
 		int m_firstVertex;
 		/** Index of first triangle in the world allocated to this cloth */
 		int m_firstTriangle;
 		/** Index of first link in the world allocated to this cloth */
 		int m_firstLink;
 		/** Maximum number of links allocated to this cloth */
 		int m_maxLinks;
 		/** Current number of links allocated to this cloth */
 		int m_numLinks;
 		/** The actual soft body this data represents */
 		btSoftBody *m_softBody;
 	public:
 		/**
 		 * Create a record for softBody with every count and offset set to zero.
 		 * The pointer is stored as given; it is not dereferenced here.
 		 */
 		btAcceleratedSoftBodyInterface( btSoftBody *softBody ) :
 			m_numVertices( 0 ),
 			m_maxVertices( 0 ),
 			m_numTriangles( 0 ),
 			m_maxTriangles( 0 ),
 			m_firstVertex( 0 ),
 			m_firstTriangle( 0 ),
 			m_firstLink( 0 ),
 			m_maxLinks( 0 ),
 			m_numLinks( 0 ),
 			m_softBody( softBody )
 		{
 		}
 		// Read accessors. They are const so they can be called through a
 		// const pointer or reference to the record.
 		int getNumVertices() const
 		{
 			return m_numVertices;
 		}
 		int getNumTriangles() const
 		{
 			return m_numTriangles;
 		}
 		int getMaxVertices() const
 		{
 			return m_maxVertices;
 		}
 		int getMaxTriangles() const
 		{
 			return m_maxTriangles;
 		}
 		int getFirstVertex() const
 		{
 			return m_firstVertex;
 		}
 		int getFirstTriangle() const
 		{
 			return m_firstTriangle;
 		}
 		int getMaxLinks() const
 		{
 			return m_maxLinks;
 		}
 		int getNumLinks() const
 		{
 			return m_numLinks;
 		}
 		int getFirstLink() const
 		{
 			return m_firstLink;
 		}
 		/** Returns the soft body pointer given to the constructor. */
 		btSoftBody* getSoftBody() const
 		{
 			return m_softBody;
 		}
 		// TODO: All of these set functions will have to do checks and
 		// update the world because restructuring of the arrays will be necessary
 		// Reasonable use of "friend"?
 		void setNumVertices( int numVertices )
 		{
 			m_numVertices = numVertices;
 		}
 		void setNumTriangles( int numTriangles )
 		{
 			m_numTriangles = numTriangles;
 		}
 		void setMaxVertices( int maxVertices )
 		{
 			m_maxVertices = maxVertices;
 		}
 		void setMaxTriangles( int maxTriangles )
 		{
 			m_maxTriangles = maxTriangles;
 		}
 		void setFirstVertex( int firstVertex )
 		{
 			m_firstVertex = firstVertex;
 		}
 		void setFirstTriangle( int firstTriangle )
 		{
 			m_firstTriangle = firstTriangle;
 		}
 		void setMaxLinks( int maxLinks )
 		{
 			m_maxLinks = maxLinks;
 		}
 		void setNumLinks( int numLinks )
 		{
 			m_numLinks = numLinks;
 		}
 		void setFirstLink( int firstLink )
 		{
 			m_firstLink = firstLink;
 		}
 	// Disabled forwarding helpers, kept as found. They reference m_currentSolver
 	// and m_clothIdentifier, neither of which is declared in this class.
 	#if 0
 		void setAcceleration( Vectormath::Aos::Vector3 acceleration )
 		{
 			m_currentSolver->setPerClothAcceleration( m_clothIdentifier, acceleration );
 		}
 		void setWindVelocity( Vectormath::Aos::Vector3 windVelocity )
 		{
 			m_currentSolver->setPerClothWindVelocity( m_clothIdentifier, windVelocity );
 		}
 		/** 
 		 * Set the density of the air in which the cloth is situated.
 		 */
 		void setAirDensity( btScalar density )
 		{
 			m_currentSolver->setPerClothMediumDensity( m_clothIdentifier, static_cast<float>(density) );
 		}
 		/**
 		 * Add a collision object to this soft body.
 		 */
 		void addCollisionObject( btCollisionObject *collisionObject )
 		{
 			m_currentSolver->addCollisionObjectForSoftBody( m_clothIdentifier, collisionObject );
 		}
 	#endif
 	};
 	/**
 	 * Holder for a single cl::Kernel object.
 	 * Construction and destruction do nothing beyond what the member itself
 	 * does; the destructor is virtual.
 	 */
 	class KernelDesc
 	{
 	public:
 		KernelDesc() {}
 		virtual ~KernelDesc() {}
 		/** The wrapped kernel object. */
 		cl::Kernel kernel;
 	};
 	btSoftBodyLinkDataOpenCL m_linkData;
 	btSoftBodyVertexDataOpenCL m_vertexData;
@@ -228,7 +189,7 @@ private:
 	 * Cloths owned by this solver.
 	 * Only our cloths are in this array.
 	 */
-	btAlignedObjectArray< btAcceleratedSoftBodyInterface * > m_softBodySet;
+	btAlignedObjectArray< btOpenCLAcceleratedSoftBodyInterface * > m_softBodySet;
 	/** Acceleration value to be applied to all non-static vertices in the solver. 
 	 * Index n is cloth n, array sized by number of cloths in the world not the solver. 
@@ -262,37 +223,34 @@ private:
 	btAlignedObjectArray< float >						m_perClothMediumDensity;
 	btOpenCLBuffer<float>								m_clPerClothMediumDensity;
-	KernelDesc		prepareLinksKernel;
+	cl_kernel		prepareLinksKernel;
-	KernelDesc		solvePositionsFromLinksKernel;
+	cl_kernel		solvePositionsFromLinksKernel;
-	KernelDesc		updateConstantsKernel;
+	cl_kernel		updateConstantsKernel;
-	KernelDesc		integrateKernel;
+	cl_kernel		integrateKernel;
-	KernelDesc		addVelocityKernel;
+	cl_kernel		addVelocityKernel;
-	KernelDesc		updatePositionsFromVelocitiesKernel;
+	cl_kernel		updatePositionsFromVelocitiesKernel;
-	KernelDesc		updateVelocitiesFromPositionsWithoutVelocitiesKernel;
+	cl_kernel		updateVelocitiesFromPositionsWithoutVelocitiesKernel;
-	KernelDesc		updateVelocitiesFromPositionsWithVelocitiesKernel;
+	cl_kernel		updateVelocitiesFromPositionsWithVelocitiesKernel;
-	KernelDesc		vSolveLinksKernel;
+	cl_kernel		vSolveLinksKernel;
-	KernelDesc		resetNormalsAndAreasKernel;
+	cl_kernel		resetNormalsAndAreasKernel;
-	KernelDesc		normalizeNormalsAndAreasKernel;
+	cl_kernel		normalizeNormalsAndAreasKernel;
-	KernelDesc		updateSoftBodiesKernel;
+	cl_kernel		updateSoftBodiesKernel;
-	KernelDesc		outputToVertexArrayWithNormalsKernel;
+	cl_kernel		outputToVertexArrayWithNormalsKernel;
-	KernelDesc		outputToVertexArrayWithoutNormalsKernel;
+	cl_kernel		outputToVertexArrayWithoutNormalsKernel;
-	KernelDesc		outputToVertexArrayKernel;
+	cl_kernel		outputToVertexArrayKernel;
-	KernelDesc		applyForcesKernel;
+	cl_kernel		applyForcesKernel;
-	KernelDesc		collideSphereKernel;
+	cl_kernel		collideSphereKernel;
-	KernelDesc		collideCylinderKernel;
+	cl_kernel		collideCylinderKernel;
-	static const int workGroupSize = 128;
+	cl_command_queue	m_cqCommandQue;
-
+	cl_context			m_cxMainContext;
 	cl::CommandQueue m_queue;
 	cl::Context context;
 	cl::Device device;
 	/**
-	 * Compile a compute shader kernel from a string and return the appropriate KernelDesc object.
+	 * Compile a compute shader kernel from a string and return the appropriate cl_kernel object.
 	 */
-	KernelDesc compileCLKernelFromString( const char *shaderString, const char *shaderName );
+	cl_kernel compileCLKernelFromString( const char *shaderString, const char *shaderName );
 	bool buildShaders();
@@ -306,7 +264,7 @@ private:
 	void ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce );
-	btAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
+	btOpenCLAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
 	virtual void applyForces( float solverdt );
@@ -342,7 +300,7 @@ private:
 public:
-	btOpenCLSoftBodySolver(const cl::CommandQueue &queue);
+	btOpenCLSoftBodySolver(cl_command_queue queue,cl_context	ctx);
 	virtual ~btOpenCLSoftBodySolver();
@@ -371,4 +329,4 @@ public:
 	virtual void copySoftBodyToVertexBuffer( const btSoftBody *const softBody, btVertexBufferDescriptor *vertexBuffer );
 }; // btOpenCLSoftBodySolver
-#endif #ifndef BT_SOFT_BODY_SOLVER_OPENCL_H
+#endif #ifndef BT_SOFT_BODY_SOLVER_OPENCL_H
--- a/src/BulletSoftBody/btSoftBodySolvers.h
+++ b/src/BulletSoftBody/btSoftBodySolvers.h
@@ -23,7 +23,6 @@ class btSoftBodyTriangleData;
 class btSoftBodyLinkData;
 class btSoftBodyVertexData;
 class btVertexBufferDescriptor;
 class btAcceleratedSoftBodyInterface;
 class btCollisionObject;
 class btSoftBody;
--- a/src/MiniCL/MiniCL.cpp
+++ b/src/MiniCL/MiniCL.cpp
@@ -30,6 +30,7 @@ subject to the following restrictions:
 //#define DEBUG_MINICL_KERNELS 1
 static char* spPlatformID = "MiniCL, SCEA";
 static char* spDriverVersion= "1.0";
 CL_API_ENTRY cl_int CL_API_CALL clGetPlatformIDs(
 	cl_uint           num_entries,
@@ -91,23 +92,24 @@ CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(
 	cl_device_info          param_name ,
 	size_t                  param_value_size ,
 	void *                  param_value ,
-	size_t *                /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0
+	size_t *                param_value_size_ret) CL_API_SUFFIX__VERSION_1_0
 {
 	switch (param_name)
 	{
 	case CL_DEVICE_NAME:
 		{
-			char deviceName[] = "CPU";
+			char deviceName[] = "MiniCL CPU";
 			unsigned int nameLen = strlen(deviceName)+1;
 			btAssert(param_value_size>strlen(deviceName));
 			if (nameLen < param_value_size)
 			{
-				const char* cpuName = "CPU";
+				const char* cpuName = "MiniCL CPU";
 				sprintf((char*)param_value,"%s",cpuName);
 			} else
 			{
 				printf("error: param_value_size should be at least %d, but it is %d\n",nameLen,param_value_size);
 				return CL_INVALID_VALUE; 
 			}
 			break;
 		}
@@ -120,6 +122,7 @@ CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(
 			} else
 			{
 				printf("error: param_value_size should be at least %d\n",sizeof(cl_device_type));
 				return CL_INVALID_VALUE; 
 			}
 			break;
 		}
@@ -132,6 +135,7 @@ CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(
 			} else
 			{
 				printf("error: param_value_size should be at least %d\n",sizeof(cl_uint));
 				return CL_INVALID_VALUE; 
 			}
 			break;
@@ -149,6 +153,7 @@ CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(
 			} else
 			{
 				printf("error: param_value_size should be at least %d\n",sizeof(cl_uint));
 				return CL_INVALID_VALUE; 
 			}
 			break;
 		}
@@ -158,6 +163,142 @@ CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(
 			 *clock_frequency = 3*1024;
 			break;
 		}
 	case CL_DEVICE_VENDOR	:
 		{
 			if(param_value_size < (strlen(spPlatformID) + 1))
 			{
 				return CL_INVALID_VALUE; 
 			}
 			strcpy((char*)param_value, spPlatformID);
 			if(param_value_size_ret != NULL)
 			{
 				*param_value_size_ret = strlen(spPlatformID) + 1;
 			}
 			break;
 		}
 	case CL_DRIVER_VERSION:
 		{
 			if(param_value_size < (strlen(spDriverVersion) + 1))
 			{
 				return CL_INVALID_VALUE; 
 			}
 			strcpy((char*)param_value, spDriverVersion);
 			if(param_value_size_ret != NULL)
 			{
 				*param_value_size_ret = strlen(spDriverVersion) + 1;
 			}
 			break;
 		}
 	case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
 		{
 			 cl_uint* maxDimensions = (cl_uint*)param_value;
 			 *maxDimensions = 1;
 			 break;
 		}
 		case CL_DEVICE_MAX_WORK_GROUP_SIZE:
 		{
 			 cl_uint* maxWorkGroupSize = (cl_uint*)param_value;
 			 *maxWorkGroupSize = 128;//1;
 			 break;
 		}
 		case CL_DEVICE_ADDRESS_BITS:
 		{
 			 cl_uint* addressBits = (cl_uint*)param_value;
 			 *addressBits= 32; //@todo: should this be 64 for 64bit builds?
 			 break;
 		}
 		case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
 			{
 				cl_ulong* maxMemAlloc = (cl_ulong*)param_value;
 				*maxMemAlloc= 512*1024*1024; //this "should be enough for everyone" ?
 			 break;
 			}
 		case CL_DEVICE_GLOBAL_MEM_SIZE:
 			{
 				cl_ulong* maxMemAlloc = (cl_ulong*)param_value;
 				*maxMemAlloc= 1024*1024*1024; //this "should be enough for everyone" ?
 			 break;
 			}
 		case CL_DEVICE_ERROR_CORRECTION_SUPPORT:
 			{
 			cl_bool* error_correction_support = (cl_bool*)param_value;
 			*error_correction_support = CL_FALSE;
 			break;
 			}
 		case CL_DEVICE_LOCAL_MEM_TYPE:
 			{
 			cl_device_local_mem_type* local_mem_type = (cl_device_local_mem_type*)param_value;
 			*local_mem_type = CL_GLOBAL;
 			break;
 			}
 		case CL_DEVICE_LOCAL_MEM_SIZE:
 			{
 				cl_ulong* localmem = (cl_ulong*) param_value;
 				*localmem = 32*1024;
 				break;
 			}
 		case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:
 			{
 				cl_ulong* localmem = (cl_ulong*) param_value;
 				*localmem = 64*1024;
 				break;
 			}
 		case CL_DEVICE_QUEUE_PROPERTIES:
 			{
 				cl_command_queue_properties* queueProp = (cl_command_queue_properties*) param_value;
 				memset(queueProp,0,param_value_size);
 				break;
 			}
 		case CL_DEVICE_IMAGE_SUPPORT:
 			{
 				cl_bool* imageSupport = (cl_bool*) param_value;
 				*imageSupport = CL_FALSE;
 				break;
 			}
 		case CL_DEVICE_MAX_WRITE_IMAGE_ARGS:
 		case CL_DEVICE_MAX_READ_IMAGE_ARGS:
 			{
 				cl_uint* imageArgs = (cl_uint*) param_value;
 				*imageArgs = 0;
 				break;
 			}
 		case CL_DEVICE_IMAGE3D_MAX_DEPTH:
 		case CL_DEVICE_IMAGE3D_MAX_HEIGHT:
 		case CL_DEVICE_IMAGE2D_MAX_HEIGHT:
 		case CL_DEVICE_IMAGE3D_MAX_WIDTH:
 		case CL_DEVICE_IMAGE2D_MAX_WIDTH:
 			{
 				size_t* maxSize = (size_t*) param_value;
 				*maxSize = 0;
 				break;
 			}
 		case CL_DEVICE_EXTENSIONS:
 			{
 				char* extensions = (char*) param_value;
 				*extensions = 0;
 				break;
 			}
 		case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
 		case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:
 		case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:
 		case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:
 		case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
 		case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
 			{
 				cl_uint* width  = (cl_uint*) param_value;
 				*width = 1;
 				break;
 			}
 	default:
 		{
 			printf("error: unsupported param_name:%d\n",param_name);
@@ -486,7 +627,7 @@ extern CL_API_ENTRY cl_int CL_API_CALL clGetContextInfo(cl_context         /* co
 }
 CL_API_ENTRY cl_context CL_API_CALL clCreateContextFromType(cl_context_properties * /* properties */,
-                        cl_device_type          /* device_type */,
+                        cl_device_type           device_type ,
                        void (*pfn_notify)(const char *, const void *, size_t, void *) /* pfn_notify */,
                        void *                  /* user_data */,
                        cl_int *                 errcode_ret ) CL_API_SUFFIX__VERSION_1_0
@@ -502,14 +643,18 @@ CL_API_ENTRY cl_context CL_API_CALL clCreateContextFromType(cl_context_propertie
 		"MiniCL_0", "MiniCL_1", "MiniCL_2", "MiniCL_3", "MiniCL_4", "MiniCL_5", "MiniCL_6", "MiniCL_7" 
 	};
-#ifdef DEBUG_MINICL_KERNELS
+	btThreadSupportInterface* threadSupport = 0;
-	SequentialThreadSupport::SequentialThreadConstructionInfo stc("MiniCL",processMiniCLTask,createMiniCLLocalStoreMemory);
+
-	SequentialThreadSupport* threadSupport = new SequentialThreadSupport(stc);
+	if (device_type==CL_DEVICE_TYPE_DEBUG)
-#else
+	{
 		SequentialThreadSupport::SequentialThreadConstructionInfo stc("MiniCL",processMiniCLTask,createMiniCLLocalStoreMemory);
 		threadSupport = new SequentialThreadSupport(stc);
 	} else
 	{
 #if _WIN32
 	btAssert(sUniqueThreadSupportIndex < maxNumOfThreadSupports);
-	Win32ThreadSupport* threadSupport = new Win32ThreadSupport(Win32ThreadSupport::Win32ThreadConstructionInfo(
+	threadSupport = new Win32ThreadSupport(Win32ThreadSupport::Win32ThreadConstructionInfo(
 //								"MiniCL",
 								sUniqueThreadSupportName[sUniqueThreadSupportIndex++],
 								processMiniCLTask, //processCollisionTask,
@@ -518,10 +663,10 @@ CL_API_ENTRY cl_context CL_API_CALL clCreateContextFromType(cl_context_propertie
 #else
 	///todo: add posix thread support for other platforms
 	SequentialThreadSupport::SequentialThreadConstructionInfo stc("MiniCL",processMiniCLTask,createMiniCLLocalStoreMemory);
-	SequentialThreadSupport* threadSupport = new SequentialThreadSupport(stc);
+	threadSupport = new SequentialThreadSupport(stc);
 #endif
-#endif //DEBUG_MINICL_KERNELS
+	}
 	MiniCLTaskScheduler* scheduler = new MiniCLTaskScheduler(threadSupport,maxNumOutstandingTasks);
--- a/src/MiniCL/cl.h
+++ b/src/MiniCL/cl.h
@@ -155,8 +155,10 @@ typedef struct _cl_image_format {
 // Device type bits accepted by the context/device creation entry points.
 #define CL_DEVICE_TYPE_CPU                          (1 << 1)
 #define CL_DEVICE_TYPE_GPU                          (1 << 2)
 #define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
 // clCreateContextFromType in MiniCL.cpp compares device_type against this value with ==
 // and, on a match, uses SequentialThreadSupport instead of the platform thread support.
 // Because the test is an equality test, CL_DEVICE_TYPE_ALL does not select that path
 // even though it has this bit set.
 // NOTE(review): appears to be a MiniCL addition rather than a standard OpenCL device type — confirm.
 #define CL_DEVICE_TYPE_DEBUG						(1 << 4)
 #define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
 // cl_device_info
 #define CL_DEVICE_TYPE                              0x1000
 #define CL_DEVICE_VENDOR_ID                         0x1001
--- a/src/MiniCL/cl_MiniCL_Defs.h
+++ b/src/MiniCL/cl_MiniCL_Defs.h
@@ -140,6 +140,8 @@ static float4 operator+(const float4& a,const float4& b)
 	return tmp;
 }
 static float4 operator-(const float4& a,const float4& b)
 {
 	float4 tmp;
@@ -159,6 +161,17 @@ static float4 operator*(float a,const float4& b)
 	return tmp;
 }
 /// Component-wise division of a float4 by a scalar.
 /// Each of the four lanes of b (x, y, z and w) is divided by a; a is not checked against zero.
 static float4 operator/(const float4& b,float a)
 {
 	// Start from a copy of the dividend and divide every lane in place.
 	float4 quotient = b;
 	quotient.x /= a;
 	quotient.y /= a;
 	quotient.z /= a;
 	quotient.w /= a;
 	return quotient;
 }
 static float dot(const float4&a ,const float4& b)
 {
@@ -170,6 +183,22 @@ static float dot(const float4&a ,const float4& b)
 	return tmp.x+tmp.y+tmp.z+tmp.w;
 }
 /// Euclidean length of the x, y and z components of a.
 /// The w component does not contribute to the result.
 static float length(const float4&a)
 {
 	const float squaredLength = a.x*a.x+a.y*a.y+a.z*a.z;
 	return sqrtf(squaredLength);
 }
 /// Returns a scaled by the reciprocal of length(a), i.e. by 1 / sqrt(x*x + y*y + z*z);
 /// the w component plays no part in the length.
 /// A vector whose length evaluates to zero is returned unchanged: scaling it by 1/0
 /// would otherwise fill the result with inf/NaN values.
 /// NOTE(review): the scaling goes through operator*(float, const float4&), whose body is not
 /// visible here — presumably it multiplies w as well; confirm if w must be preserved.
 static float4 normalize(const float4&a)
 {
 	const float l = length(a);
 	if (l == 0.f)
 	{
 		// Nothing sensible to scale towards; hand the input back untouched.
 		return a;
 	}
 	// Same expression as before: (1/l) first, then scalar * vector.
 	return 1.f/l*a;
 }
 static float4 cross(const float4&a ,const float4& b)
 {
 	float4 tmp;