Synchronize changes from branches/GpuClothAMD to trunk

Main improvements are: GPU cloth collision detection against a capsule shape, OpenCL-OpenGL interoperability (keeping data buffers on the GPU), and bug fixes. Thanks to Lee Howes.
2011-02-27 09:07:07 +00:00
parent ec1bd45f4f
commit d52f58edd8
37 changed files with 3267 additions and 2481 deletions
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/AMD/CMakeLists.txt
@@ -11,15 +11,21 @@ ADD_DEFINITIONS(-DCL_PLATFORM_AMD)

 SET(BulletSoftBodyOpenCLSolvers_SRCS
 	../btSoftBodySolver_OpenCL.cpp
+	../btSoftBodySolver_OpenCLSIMDAware.cpp
+	../btSoftBodySolverOutputCLtoGL.cpp
 )

 SET(BulletSoftBodyOpenCLSolvers_HDRS
 	../btSoftBodySolver_OpenCL.h
+	../btSoftBodySolver_OpenCLSIMDAware.h
 	../../CPU/btSoftBodySolverData.h
 	../btSoftBodySolverVertexData_OpenCL.h
 	../btSoftBodySolverTriangleData_OpenCL.h
 	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCLSIMDAware.h
 	../btSoftBodySolverBuffer_OpenCL.h
+	../btSoftBodySolverVertexBuffer_OpenGL.h
+	../btSoftBodySolverOutputCLtoGL.h
 )

 # OpenCL and HLSL Shaders.
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/NVidia/CMakeLists.txt
@@ -5,8 +5,11 @@ INCLUDE_DIRECTORIES(
 )


+
 SET(BulletSoftBodyOpenCLSolvers_SRCS
 	../btSoftBodySolver_OpenCL.cpp
+	../btSoftBodySolver_OpenCLSIMDAware.cpp
+	../btSoftBodySolverOutputCLtoGL.cpp
 )

 SET(BulletSoftBodyOpenCLSolvers_HDRS
@@ -15,7 +18,10 @@ SET(BulletSoftBodyOpenCLSolvers_HDRS
 	../btSoftBodySolverVertexData_OpenCL.h
 	../btSoftBodySolverTriangleData_OpenCL.h
 	../btSoftBodySolverLinkData_OpenCL.h
+	../btSoftBodySolverLinkData_OpenCLSIMDAware.h
 	../btSoftBodySolverBuffer_OpenCL.h
+	../btSoftBodySolverVertexBuffer_OpenGL.h
+	../btSoftBodySolverOutputCLtoGL.h
 )

 # OpenCL and HLSL Shaders.
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverBuffer_OpenCL.h
@@ -65,6 +65,9 @@ public:
 			cl_mem_flags flags= m_readOnlyOnGPU ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE;

 			size_t size = m_CPUBuffer->size() * sizeof(ElementType);
+			// At a minimum the buffer must exist
+			if( size == 0 )
+				size = sizeof(ElementType);
 			m_buffer = clCreateBuffer(m_clContext, flags, size, 0, &err);
 			if( err != CL_SUCCESS )
 			{
@@ -81,6 +84,7 @@ public:
 	btOpenCLBuffer( cl_command_queue	commandQue,cl_context ctx, btAlignedObjectArray< ElementType >* CPUBuffer, bool readOnly)
 		:m_cqCommandQue(commandQue),
 		m_clContext(ctx),
+		m_buffer(0),
 		m_CPUBuffer(CPUBuffer),
 		m_gpuSize(0),
 		m_onGPU(false),
@@ -91,6 +95,7 @@ public:

 	~btOpenCLBuffer()
 	{
+		// m_buffer is initialised to 0 by the constructor and is only assigned when a
+		// device buffer is created, so it may still be 0 here. Only release a real
+		// handle: passing 0 to clReleaseMemObject is an invalid-object error.
+		if( m_buffer )
+			clReleaseMemObject(m_buffer);
 	}


@@ -105,6 +110,16 @@ public:
 			m_onGPU = false;
 		}

+		if( !m_allocated && m_CPUBuffer->size() == 0  )
+		{
+			// If it isn't on the GPU and yet there is no data on the CPU side this may cause a problem with some kernels.
+			// We should create *something* on the device side
+			if (!createBuffer()) {
+				return false;
+			}
+			m_allocated = true;
+		}
+
 		if( !m_onGPU && m_CPUBuffer->size() > 0 )
 		{
 			if (!m_allocated || (m_CPUBuffer->size() != m_gpuSize)) {
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.cpp
@@ -20,10 +20,26 @@ subject to the following restrictions:
 #include "btSoftBodySolver_OpenCL.h"
 #include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
 #include "BulletSoftBody/btSoftBody.h"
+#include "BulletCollision/CollisionShapes/btCapsuleShape.h"
+#include "LinearMath/btQuickprof.h"
+
+#ifdef USE_MINICL
+	#include "MiniCL/cl.h"
+#else //USE_MINICL
+	#ifdef __APPLE__
+		#include <OpenCL/OpenCL.h>
+	#else
+		#include <CL/cl.h>
+	#endif //__APPLE__
+#endif//USE_MINICL

 #define BT_DEFAULT_WORKGROUPSIZE 128


+
+#define RELEASE_CL_KERNEL(kernelName) {if( kernelName ){ clReleaseKernel( kernelName ); kernelName = 0; }}
+
+
 //CL_VERSION_1_1 seems broken on NVidia SDK so just disable it

 #if (0)//CL_VERSION_1_1 == 1)
@@ -49,6 +65,10 @@ static char* UpdateNormalsCLString =
 #include "OpenCLC/UpdateNormals.cl"
 static char* VSolveLinksCLString = 
 #include "OpenCLC/VSolveLinks.cl"
+static char* ComputeBoundsCLString = 
+#include "OpenCLC/ComputeBounds.cl"
+static char* SolveCollisionsAndUpdateVelocitiesCLString =
+#include "OpenCLC/SolveCollisionsAndUpdateVelocities.cl"
 #else
 ////OpenCL 1.0 kernels don't use float3
 #define MSTRINGIFY(A) #A
@@ -72,6 +92,10 @@ static char* UpdateNormalsCLString =
 #include "OpenCLC10/UpdateNormals.cl"
 static char* VSolveLinksCLString = 
 #include "OpenCLC10/VSolveLinks.cl"
+static char* ComputeBoundsCLString = 
+#include "OpenCLC10/ComputeBounds.cl"
+static char* SolveCollisionsAndUpdateVelocitiesCLString =
+#include "OpenCLC10/SolveCollisionsAndUpdateVelocities.cl"
 #endif //CL_VERSION_1_1


@@ -583,6 +607,7 @@ btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_contex
 	m_linkData(queue, ctx),
 	m_vertexData(queue, ctx),
 	m_triangleData(queue, ctx),
+	clFunctions(queue, ctx),
 	m_clPerClothAcceleration(queue, ctx, &m_perClothAcceleration, true ),
 	m_clPerClothWindVelocity(queue, ctx, &m_perClothWindVelocity, true ),
 	m_clPerClothDampingFactor(queue,ctx, &m_perClothDampingFactor, true ),
@@ -590,6 +615,11 @@ btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_contex
 	m_clPerClothLiftFactor(queue, ctx,&m_perClothLiftFactor, true ),
 	m_clPerClothDragFactor(queue, ctx,&m_perClothDragFactor, true ),
 	m_clPerClothMediumDensity(queue, ctx,&m_perClothMediumDensity, true ),
+	m_clPerClothCollisionObjects( queue, ctx, &m_perClothCollisionObjects, true ),
+	m_clCollisionObjectDetails( queue, ctx, &m_collisionObjectDetails, true ),
+	m_clPerClothMinBounds( queue, ctx, &m_perClothMinBounds, false ),
+	m_clPerClothMaxBounds( queue, ctx, &m_perClothMaxBounds, false ),
+	m_clPerClothFriction( queue, ctx, &m_perClothFriction, false ),
 	m_cqCommandQue( queue ),
 	m_cxMainContext(ctx),
 	m_defaultWorkGroupSize(BT_DEFAULT_WORKGROUPSIZE)
@@ -600,15 +630,85 @@ btOpenCLSoftBodySolver::btOpenCLSoftBodySolver(cl_command_queue queue, cl_contex
 	m_updateSolverConstants = true;

 	m_shadersInitialized = false;
+
+	prepareLinksKernel = 0;
+	solvePositionsFromLinksKernel = 0;
+	updateConstantsKernel = 0;
+	integrateKernel = 0;
+	addVelocityKernel = 0;
+	updatePositionsFromVelocitiesKernel = 0;
+	updateVelocitiesFromPositionsWithoutVelocitiesKernel = 0;
+	updateVelocitiesFromPositionsWithVelocitiesKernel = 0;
+	vSolveLinksKernel = 0;
+	solveCollisionsAndUpdateVelocitiesKernel = 0;
+	resetNormalsAndAreasKernel = 0;
+	resetNormalsAndAreasKernel = 0;
+	normalizeNormalsAndAreasKernel = 0;
+	computeBoundsKernel = 0;
+	outputToVertexArrayKernel = 0;
+	applyForcesKernel = 0;
 }

 btOpenCLSoftBodySolver::~btOpenCLSoftBodySolver()
 {
+	// The only explicit teardown done here is handing the kernel handles back via releaseKernels().
+	releaseKernels();
 }

-void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &softBodies )
+// Release every kernel handle held by the solver and clear the "shaders initialized" flag.
+void btOpenCLSoftBodySolver::releaseKernels()
 {
-	if( m_softBodySet.size() != softBodies.size() )
+	// RELEASE_CL_KERNEL only calls clReleaseKernel on a non-zero handle and then
+	// zeroes it, so handles that were never created (or were already released) are skipped.
+	RELEASE_CL_KERNEL( prepareLinksKernel );
+	RELEASE_CL_KERNEL( solvePositionsFromLinksKernel );
+	RELEASE_CL_KERNEL( updateConstantsKernel );
+	RELEASE_CL_KERNEL( integrateKernel );
+	RELEASE_CL_KERNEL( addVelocityKernel );
+	RELEASE_CL_KERNEL( updatePositionsFromVelocitiesKernel );
+	RELEASE_CL_KERNEL( updateVelocitiesFromPositionsWithoutVelocitiesKernel );
+	RELEASE_CL_KERNEL( updateVelocitiesFromPositionsWithVelocitiesKernel );
+	RELEASE_CL_KERNEL( vSolveLinksKernel );
+	RELEASE_CL_KERNEL( solveCollisionsAndUpdateVelocitiesKernel );
+	RELEASE_CL_KERNEL( resetNormalsAndAreasKernel );
+	RELEASE_CL_KERNEL( normalizeNormalsAndAreasKernel );
+	RELEASE_CL_KERNEL( computeBoundsKernel );
+	RELEASE_CL_KERNEL( outputToVertexArrayKernel );
+	RELEASE_CL_KERNEL( applyForcesKernel );
+
+	// Back to the same flag value the constructor sets.
+	m_shadersInitialized = false;
+}
+
+/**
+ * Copy the solver's per-vertex results back into the btSoftBody objects it manages.
+ *
+ * The vertex data is first moved from the accelerator to the host. Then, for every
+ * soft body in m_softBodySet, each node's position (m_x) and normal (m_n) is
+ * overwritten with the solver's value for the matching vertex.
+ */
+void btOpenCLSoftBodySolver::copyBackToSoftBodies()
+{
+	// Move the vertex data back to the host first
+	m_vertexData.moveFromAccelerator();
+
+	// Loop over soft bodies, copying the vertex positions and normals back for each body in turn
+	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
+	{
+		btOpenCLAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[ softBodyIndex ];
+		btSoftBody *softBody = softBodyInterface->getSoftBody();
+
+		// This body's vertices occupy [firstVertex, firstVertex + numVertices) in the solver arrays
+		int firstVertex = softBodyInterface->getFirstVertex();
+		int numVertices = softBodyInterface->getNumVertices();
+
+		// Copy vertices from solver back into the softbody
+		for( int vertex = 0; vertex < numVertices; ++vertex )
+		{
+			using Vectormath::Aos::Point3;
+			Point3 vertexPosition( getVertexData().getVertexPositions()[firstVertex + vertex] );
+			// The normal must come from the normal array; previously the position was
+			// written into m_n as well, leaving every node with a bogus normal.
+			Vectormath::Aos::Vector3 vertexNormal = m_vertexData.getNormal( firstVertex + vertex );
+
+			softBody->m_nodes[vertex].m_x.setX( vertexPosition.getX() );
+			softBody->m_nodes[vertex].m_x.setY( vertexPosition.getY() );
+			softBody->m_nodes[vertex].m_x.setZ( vertexPosition.getZ() );
+
+			softBody->m_nodes[vertex].m_n.setX( vertexNormal.getX() );
+			softBody->m_nodes[vertex].m_n.setY( vertexNormal.getY() );
+			softBody->m_nodes[vertex].m_n.setZ( vertexNormal.getZ() );
+		}
+	}
+} // btOpenCLSoftBodySolver::copyBackToSoftBodies
+
+void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &softBodies, bool forceUpdate )
+{
+	if( forceUpdate || m_softBodySet.size() != softBodies.size() )
 	{
 		// Have a change in the soft body set so update, reloading all the data
 		getVertexData().clear();
@@ -633,6 +733,11 @@ void btOpenCLSoftBodySolver::optimize( btAlignedObjectArray< btSoftBody * > &sof
 			m_perClothLiftFactor.push_back( softBody->m_cfg.kLF );
 			m_perClothDragFactor.push_back( softBody->m_cfg.kDG );
 			m_perClothMediumDensity.push_back(softBody->getWorldInfo()->air_density);
+			// Simple init values. Actually we'll put 0 and -1 into them at the appropriate time
+			m_perClothMinBounds.push_back( UIntVector3(UINT_MAX, UINT_MAX, UINT_MAX) );
+			m_perClothMaxBounds.push_back( UIntVector3(0, 0, 0) );
+			m_perClothFriction.push_back( softBody->getFriction() );
+			m_perClothCollisionObjects.push_back( CollisionObjectIndices(-1, -1) );

 			// Add space for new vertices and triangles in the default solver for now
 			// TODO: Include space here for tearing too later
@@ -738,12 +843,6 @@ btSoftBodyTriangleData &btOpenCLSoftBodySolver::getTriangleData()
 	return m_triangleData;
 }

-
-bool btOpenCLSoftBodySolver::checkInitialized()
-{
-	return buildShaders();
-}
-
 void btOpenCLSoftBodySolver::resetNormalsAndAreas( int numVertices )
 {
 	cl_int ciErrNum;
@@ -751,11 +850,15 @@ void btOpenCLSoftBodySolver::resetNormalsAndAreas( int numVertices )
 	ciErrNum = clSetKernelArg(resetNormalsAndAreasKernel, 1, sizeof(cl_mem), (void*)&m_vertexData.m_clVertexNormal.m_buffer);//oclCHECKERROR(ciErrNum, CL_SUCCESS);
 	ciErrNum = clSetKernelArg(resetNormalsAndAreasKernel,  2, sizeof(cl_mem), (void*)&m_vertexData.m_clVertexArea.m_buffer); //oclCHECKERROR(ciErrNum, CL_SUCCESS);
 	size_t numWorkItems = m_defaultWorkGroupSize*((numVertices + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, resetNormalsAndAreasKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize, 0,0,0 );

-	if( ciErrNum != CL_SUCCESS )
+	if (numWorkItems)
 	{
-		btAssert( 0 && "enqueueNDRangeKernel(resetNormalsAndAreasKernel)" );
+		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, resetNormalsAndAreasKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize, 0,0,0 );
+
+		if( ciErrNum != CL_SUCCESS )
+		{
+			btAssert( 0 && "enqueueNDRangeKernel(resetNormalsAndAreasKernel)" );
+		}
 	}

 }
@@ -770,10 +873,13 @@ void btOpenCLSoftBodySolver::normalizeNormalsAndAreas( int numVertices )
 	ciErrNum = clSetKernelArg(normalizeNormalsAndAreasKernel, 2, sizeof(cl_mem), &m_vertexData.m_clVertexNormal.m_buffer);
 	ciErrNum = clSetKernelArg(normalizeNormalsAndAreasKernel, 3, sizeof(cl_mem), &m_vertexData.m_clVertexArea.m_buffer);
 	size_t	numWorkItems = m_defaultWorkGroupSize*((numVertices + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, normalizeNormalsAndAreasKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize, 0,0,0);
-	if( ciErrNum != CL_SUCCESS ) 
+	if (numWorkItems)
 	{
-		btAssert( 0 && "enqueueNDRangeKernel(normalizeNormalsAndAreasKernel)");
+		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, normalizeNormalsAndAreasKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize, 0,0,0);
+		if( ciErrNum != CL_SUCCESS ) 
+		{
+			btAssert( 0 && "enqueueNDRangeKernel(normalizeNormalsAndAreasKernel)");
+		}
 	}

 }
@@ -875,10 +981,13 @@ void btOpenCLSoftBodySolver::applyForces( float solverdt )
 	ciErrNum = clSetKernelArg(applyForcesKernel,12, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);
 	ciErrNum = clSetKernelArg(applyForcesKernel,13, sizeof(cl_mem), &m_vertexData.m_clVertexVelocity.m_buffer);
 	size_t numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,applyForcesKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize, 0,0,0);
-	if( ciErrNum != CL_SUCCESS ) 
+	if (numWorkItems)
 	{
-		btAssert( 0 &&  "enqueueNDRangeKernel(applyForcesKernel)");
+		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,applyForcesKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize, 0,0,0);
+		if( ciErrNum != CL_SUCCESS ) 
+		{
+			btAssert( 0 &&  "enqueueNDRangeKernel(applyForcesKernel)");
+		}
 	}

 }
@@ -904,10 +1013,13 @@ void btOpenCLSoftBodySolver::integrate( float solverdt )
 	ciErrNum = clSetKernelArg(integrateKernel, 6, sizeof(cl_mem), &m_vertexData.m_clVertexForceAccumulator.m_buffer);

 	size_t numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
-	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,integrateKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize,0,0,0);
-	if( ciErrNum != CL_SUCCESS )
+	if (numWorkItems)
 	{
-		btAssert( 0 &&  "enqueueNDRangeKernel(integrateKernel)");
+		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,integrateKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize,0,0,0);
+		if( ciErrNum != CL_SUCCESS )
+		{
+			btAssert( 0 &&  "enqueueNDRangeKernel(integrateKernel)");
+		}
 	}

 }
@@ -924,6 +1036,102 @@ float btOpenCLSoftBodySolver::computeTriangleArea(
 	return area;
 }

+
+// Update the bounds of every soft body in m_softBodySet.
+//
+// Two implementations are selected at compile time:
+//  - USE_GPU_BOUNDS_COMPUTATION defined: bounds are computed by computeBounds() on the
+//    device as unsigned integers, read back and converted to floats.
+//  - otherwise (the current state, since the #define below is commented out): every
+//    body is given the fixed pair (-1e30, -1e30, -1e30) / (1e30, 1e30, 1e30).
+void btOpenCLSoftBodySolver::updateBounds()
+{	
+
+//#define	USE_GPU_BOUNDS_COMPUTATION
+#ifdef USE_GPU_BOUNDS_COMPUTATION
+	using Vectormath::Aos::Point3;
+	// Interpretation structure for float and int
+	
+	struct FPRep {
+		unsigned int mantissa  : 23;
+		unsigned int exponent : 8;
+		unsigned int sign    : 1;
+	};
+	// Gives access to the same 32 bits as float, int, unsigned int or bit fields.
+	// Only floatValue and uintValue are used below.
+	union FloatAsInt
+	{
+		float floatValue;
+		int intValue;
+		unsigned int uintValue;
+		FPRep fpRep;
+	};
+
+	
+	// Update bounds array to min and max int values to allow easy atomics
+	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
+	{
+		m_perClothMinBounds[softBodyIndex] = UIntVector3( UINT_MAX, UINT_MAX, UINT_MAX );
+		m_perClothMaxBounds[softBodyIndex] = UIntVector3( 0, 0, 0 );
+	}
+	
+	// Push the vertex data and the freshly reset bounds to the device
+	m_vertexData.moveToAccelerator();
+	m_clPerClothMinBounds.moveToGPU();
+	m_clPerClothMaxBounds.moveToGPU();
+
+
+	computeBounds( );
+
+
+	// Read the per-cloth results back to the host arrays
+	m_clPerClothMinBounds.moveFromGPU();
+	m_clPerClothMaxBounds.moveFromGPU();
+
+
+	
+	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
+	{
+		UIntVector3 minBoundUInt = m_perClothMinBounds[softBodyIndex];
+		UIntVector3 maxBoundUInt = m_perClothMaxBounds[softBodyIndex];
+		/*UIntVector3 minBoundUInt;
+		minBoundUInt.x = m_perClothMinBounds[softBodyIndex*4];
+		minBoundUInt.y = m_perClothMinBounds[softBodyIndex*4+1];	
+		minBoundUInt.z = m_perClothMinBounds[softBodyIndex*4+2];
+		UIntVector3 maxBoundUInt;
+		maxBoundUInt.x = m_perClothMaxBounds[softBodyIndex*4];
+		maxBoundUInt.y = m_perClothMaxBounds[softBodyIndex*4+1];
+		maxBoundUInt.z = m_perClothMaxBounds[softBodyIndex*4+2];*/
+				
+		// Convert back to float
+		// Each component is decoded with the same XOR: when the top bit is set only
+		// that bit is flipped, otherwise all 32 bits are inverted.
+		// NOTE(review): presumably the inverse of the encoding applied by the
+		// ComputeBounds kernel (not visible here) - confirm against the kernel source.
+		FloatAsInt fai;
+
+		btVector3 minBound;
+		fai.uintValue = minBoundUInt.x;
+	    fai.uintValue ^= (((fai.uintValue >> 31) - 1) | 0x80000000);
+		minBound.setX( fai.floatValue );
+		fai.uintValue = minBoundUInt.y;
+		fai.uintValue ^= (((fai.uintValue >> 31) - 1) | 0x80000000);
+		minBound.setY( fai.floatValue );
+		fai.uintValue = minBoundUInt.z;
+		fai.uintValue ^= (((fai.uintValue >> 31) - 1) | 0x80000000);
+		minBound.setZ( fai.floatValue );
+
+		btVector3 maxBound;
+		fai.uintValue = maxBoundUInt.x; 
+		fai.uintValue ^= (((fai.uintValue >> 31) - 1) | 0x80000000);
+		maxBound.setX( fai.floatValue );
+		fai.uintValue = maxBoundUInt.y;
+		fai.uintValue ^= (((fai.uintValue >> 31) - 1) | 0x80000000);
+		maxBound.setY( fai.floatValue );
+		fai.uintValue = maxBoundUInt.z;
+		fai.uintValue ^= (((fai.uintValue >> 31) - 1) | 0x80000000);
+		maxBound.setZ( fai.floatValue );
+
+		
+		// And finally assign to the soft body
+		m_softBodySet[softBodyIndex]->updateBounds( minBound, maxBound );
+	}
+#else	
+	// Fallback: no bounds are computed; every body receives the same fixed values.
+	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
+	{
+		btVector3 minBound(-1e30,-1e30,-1e30), maxBound(1e30,1e30,1e30);
+		m_softBodySet[softBodyIndex]->updateBounds( minBound, maxBound );
+	}
+#endif//USE_GPU_BOUNDS_COMPUTATION
+
+} // btOpenCLSoftBodySolver::updateBounds
+
+
 void btOpenCLSoftBodySolver::updateConstants( float timeStep )
 {			

@@ -954,6 +1162,66 @@ void btOpenCLSoftBodySolver::updateConstants( float timeStep )

 }

+/**
+ * Ordering predicate for sorting collision shape descriptions by the soft body
+ * (cloth) they belong to. Returns true when a's softBodyIdentifier is strictly
+ * less than b's.
+ */
+class QuickSortCompare
+{
+	public:
+
+	// const-qualified: the predicate has no state, and this lets it be invoked
+	// through a const reference or a temporary.
+	bool operator() ( const CollisionShapeDescription& a, const CollisionShapeDescription& b ) const
+	{
+		return ( a.softBodyIdentifier < b.softBodyIdentifier );
+	}
+};
+
+
+/**
+ * Sort the collision object details array and generate indexing into it for the per-cloth collision object array.
+ *
+ * After the sort, the objects belonging to one cloth are contiguous. Each visited cloth's
+ * entry in m_perClothCollisionObjects receives the half-open range [firstObject, endObject)
+ * of its objects; entries that are never visited keep the (-1, -1) "no collision object"
+ * marker. The scan starts at cloth 0, so cloth 0 always receives a range, possibly empty.
+ */
+void btOpenCLSoftBodySolver::prepareCollisionConstraints()
+{
+	// First do a simple sort on the collision objects so they are grouped by cloth
+	m_collisionObjectDetails.quickSort( QuickSortCompare() );
+
+	if (!m_perClothCollisionObjects.size())
+		return;
+
+	// Generating indexing for perClothCollisionObjects
+	// First clear the previous values with the "no collision object for cloth" constant
+	for( int clothIndex = 0; clothIndex < m_perClothCollisionObjects.size(); ++clothIndex )
+	{
+		m_perClothCollisionObjects[clothIndex].firstObject = -1;
+		m_perClothCollisionObjects[clothIndex].endObject = -1;
+	}
+	int currentCloth = 0;
+	int startIndex = 0;
+	for( int collisionObject = 0; collisionObject < m_collisionObjectDetails.size(); ++collisionObject )
+	{
+		int nextCloth = m_collisionObjectDetails[collisionObject].softBodyIdentifier;
+		// The identifier is used as an index below, so it must name an existing cloth
+		btAssert( nextCloth >= 0 && nextCloth < m_perClothCollisionObjects.size() );
+		if( nextCloth != currentCloth )
+		{	
+			// Changed cloth in the array
+			// Set the end index and the range is what we need for currentCloth
+			m_perClothCollisionObjects[currentCloth].firstObject = startIndex;
+			m_perClothCollisionObjects[currentCloth].endObject = collisionObject;
+			currentCloth = nextCloth;
+			startIndex = collisionObject;
+		}
+	}
+
+	// And update last cloth	
+	m_perClothCollisionObjects[currentCloth].firstObject = startIndex;
+	m_perClothCollisionObjects[currentCloth].endObject =  m_collisionObjectDetails.size();
+	
+} // btOpenCLSoftBodySolver::prepareCollisionConstraints
+
+
+
 void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 {

@@ -993,6 +1261,9 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 		}
 	}

+	
+	prepareCollisionConstraints();
+
 	// Compute new positions from velocity
 	// Also update the previous position so that our position computation is now based on the new position from the velocity solution
 	// rather than based directly on the original positions
@@ -1016,8 +1287,9 @@ void btOpenCLSoftBodySolver::solveConstraints( float solverdt )
 		
 	} // for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )

-
-	updateVelocitiesFromPositionsWithoutVelocities( 1.f/solverdt );
+	
+	// At this point assume that the force array is blank - we will overwrite it
+	solveCollisionsAndUpdateVelocities( 1.f/solverdt );

 }

@@ -1158,19 +1430,88 @@ void btOpenCLSoftBodySolver::updateVelocitiesFromPositionsWithoutVelocities( flo

 } // updateVelocitiesFromPositionsWithoutVelocities

+
+// Run computeBoundsKernel over all vertices and wait for it to finish.
+// Arguments bound: vertex count, soft body count, the cloth identifier and vertex
+// position buffers, the per-cloth min/max bounds buffers, and two local-memory blocks.
+void btOpenCLSoftBodySolver::computeBounds( )
+{	
+	m_vertexData.moveToAccelerator();
+
+	// NOTE: ciErrNum is overwritten by each clSetKernelArg call and is only
+	// inspected after the enqueue below.
+	cl_int ciErrNum;
+	int numVerts = m_vertexData.getNumVertices();
+	int numSoftBodies = m_softBodySet.size();
+	ciErrNum = clSetKernelArg(computeBoundsKernel, 0, sizeof(int), &numVerts);
+	ciErrNum = clSetKernelArg(computeBoundsKernel, 1, sizeof(int), &numSoftBodies);
+	ciErrNum = clSetKernelArg(computeBoundsKernel, 2, sizeof(cl_mem),&m_vertexData.m_clClothIdentifier.m_buffer);
+	ciErrNum = clSetKernelArg(computeBoundsKernel, 3, sizeof(cl_mem),&m_vertexData.m_clVertexPosition.m_buffer);
+	ciErrNum = clSetKernelArg(computeBoundsKernel, 4, sizeof(cl_mem),&m_clPerClothMinBounds.m_buffer);
+	ciErrNum = clSetKernelArg(computeBoundsKernel, 5, sizeof(cl_mem),&m_clPerClothMaxBounds.m_buffer);
+	// A null value with a non-zero size requests that many bytes of local memory
+	// for the argument (here room for 256 cl_uint4 each).
+	ciErrNum = clSetKernelArg(computeBoundsKernel, 6, sizeof(cl_uint4)*256,0);
+	ciErrNum = clSetKernelArg(computeBoundsKernel, 7, sizeof(cl_uint4)*256,0);
+
+	// Round the vertex count up to a whole number of work groups; zero vertices gives zero work items
+	size_t	numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
+	if (numWorkItems)
+	{
+		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,computeBoundsKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize,0,0,0);
+		if( ciErrNum != CL_SUCCESS ) 
+		{
+			btAssert( 0 &&  "enqueueNDRangeKernel(computeBoundsKernel)");
+		}
+	}
+	// Block until everything queued on m_cqCommandQue has completed
+	clFinish(m_cqCommandQue);
+} // btOpenCLSoftBodySolver::computeBounds
+
+/**
+ * Enqueue solveCollisionsAndUpdateVelocitiesKernel over all vertices.
+ *
+ * @param isolverdt  Passed to the kernel as argument 1; solveConstraints() supplies
+ *                   1.f/solverdt, i.e. the inverse of the solver time step.
+ *
+ * The per-cloth friction, damping and collision-object arrays and the collision
+ * object details are moved to the GPU before the kernel arguments are bound.
+ * Nothing is enqueued when there are no vertices.
+ */
+void btOpenCLSoftBodySolver::solveCollisionsAndUpdateVelocities( float isolverdt )
+{
+
+	// Copy kernel parameters to GPU
+	m_vertexData.moveToAccelerator();
+	m_clPerClothFriction.moveToGPU();
+	m_clPerClothDampingFactor.moveToGPU();
+	m_clPerClothCollisionObjects.moveToGPU();
+	m_clCollisionObjectDetails.moveToGPU();
+
+
+	cl_int ciErrNum;
+	int numVerts = m_vertexData.getNumVertices();
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 0, sizeof(int), &numVerts);
+	// isolverdt is a float, so size the argument by its own type rather than int
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 1, sizeof(float), &isolverdt);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 2, sizeof(cl_mem),&m_vertexData.m_clClothIdentifier.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 3, sizeof(cl_mem),&m_vertexData.m_clVertexPreviousPosition.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 4, sizeof(cl_mem),&m_clPerClothFriction.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 5, sizeof(cl_mem),&m_clPerClothDampingFactor.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 6, sizeof(cl_mem),&m_clPerClothCollisionObjects.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 7, sizeof(cl_mem),&m_clCollisionObjectDetails.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 8, sizeof(cl_mem),&m_vertexData.m_clVertexForceAccumulator.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 9, sizeof(cl_mem),&m_vertexData.m_clVertexVelocity.m_buffer);
+	ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 10, sizeof(cl_mem),&m_vertexData.m_clVertexPosition.m_buffer);
+
+	// Round the vertex count up to a whole number of work groups
+	size_t	numWorkItems = m_defaultWorkGroupSize*((m_vertexData.getNumVertices() + (m_defaultWorkGroupSize-1)) / m_defaultWorkGroupSize);
+	if (numWorkItems)
+	{
+		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,solveCollisionsAndUpdateVelocitiesKernel, 1, NULL, &numWorkItems, &m_defaultWorkGroupSize,0,0,0);
+		if( ciErrNum != CL_SUCCESS ) 
+		{
+			// Name the kernel that actually failed (the message used to be a copy-paste of another kernel's)
+			btAssert( 0 &&  "enqueueNDRangeKernel(solveCollisionsAndUpdateVelocitiesKernel)");
+		}
+	}
+
+} // btOpenCLSoftBodySolver::solveCollisionsAndUpdateVelocities
+
+
+
 // End kernel dispatches
 /////////////////////////////////////


-void btOpenCLSoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer )
+void btSoftBodySolverOutputCLtoCPU::copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer )
 {
-	// Currently only support CPU output buffers
-	// TODO: check for DX11 buffers. Take all offsets into the same DX11 buffer
-	// and use them together on a single kernel call if possible by setting up a
-	// per-cloth target buffer array for the copy kernel.

+	btSoftBodySolver *solver = softBody->getSoftBodySolver();
+	btAssert( solver->getSolverType() == btSoftBodySolver::CL_SOLVER || solver->getSolverType() == btSoftBodySolver::CL_SIMD_SOLVER );
+	btOpenCLSoftBodySolver *dxSolver = static_cast< btOpenCLSoftBodySolver * >( solver );

-	btOpenCLAcceleratedSoftBodyInterface *currentCloth = findSoftBodyInterface( softBody );
+	btOpenCLAcceleratedSoftBodyInterface* currentCloth = dxSolver->findSoftBodyInterface( softBody );
+	btSoftBodyVertexDataOpenCL &vertexData( dxSolver->m_vertexData );
+	

 	const int firstVertex = currentCloth->getFirstVertex();
 	const int lastVertex = firstVertex + currentCloth->getNumVertices();
@@ -1180,8 +1521,8 @@ void btOpenCLSoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * cons
 		const btCPUVertexBufferDescriptor *cpuVertexBuffer = static_cast< btCPUVertexBufferDescriptor* >(vertexBuffer);						
 		float *basePointer = cpuVertexBuffer->getBasePointer();						

-		m_vertexData.m_clVertexPosition.copyFromGPU();
-		m_vertexData.m_clVertexNormal.copyFromGPU();
+		vertexData.m_clVertexPosition.copyFromGPU();
+		vertexData.m_clVertexNormal.copyFromGPU();

 		if( vertexBuffer->hasVertexPositions() )
 		{
@@ -1191,7 +1532,7 @@ void btOpenCLSoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * cons

 			for( int vertexIndex = firstVertex; vertexIndex < lastVertex; ++vertexIndex )
 			{
-				Vectormath::Aos::Point3 position = m_vertexData.getPosition(vertexIndex);
+				Vectormath::Aos::Point3 position = vertexData.getPosition(vertexIndex);
 				*(vertexPointer + 0) = position.getX();
 				*(vertexPointer + 1) = position.getY();
 				*(vertexPointer + 2) = position.getZ();
@@ -1206,7 +1547,7 @@ void btOpenCLSoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * cons

 			for( int vertexIndex = firstVertex; vertexIndex < lastVertex; ++vertexIndex )
 			{
-				Vectormath::Aos::Vector3 normal = m_vertexData.getNormal(vertexIndex);
+				Vectormath::Aos::Vector3 normal = vertexData.getNormal(vertexIndex);
 				*(normalPointer + 0) = normal.getX();
 				*(normalPointer + 1) = normal.getY();
 				*(normalPointer + 2) = normal.getZ();
@@ -1215,10 +1556,11 @@ void btOpenCLSoftBodySolver::copySoftBodyToVertexBuffer( const btSoftBody * cons
 		}
 	}

-} // btCPUSoftBodySolver::outputToVertexBuffers
+} // btSoftBodySolverOutputCLtoCPU::outputToVertexBuffers


-cl_kernel btOpenCLSoftBodySolver::compileCLKernelFromString( const char* kernelSource, const char* kernelName )
+
+cl_kernel CLFunctions::compileCLKernelFromString( const char* kernelSource, const char* kernelName, const char* additionalMacros )
 {
 	printf("compiling kernelName: %s ",kernelName);
 	cl_kernel kernel;
@@ -1229,19 +1571,45 @@ cl_kernel btOpenCLSoftBodySolver::compileCLKernelFromString( const char* kernelS
 //	oclCHECKERROR(ciErrNum, CL_SUCCESS);
 		
    // Build the program with 'mad' Optimization option
+
+	
 #ifdef MAC
 	char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
 #else
-	const char* flags = "-DGUID_ARG=";
+	//const char* flags = "-DGUID_ARG= -fno-alias";
+	const char* flags = "-DGUID_ARG= ";
 #endif
-    ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, flags, NULL, NULL);
+
+	char* compileFlags = new char[strlen(additionalMacros) + strlen(flags) + 5];
+	sprintf(compileFlags, "%s %s", flags, additionalMacros);
+    ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, compileFlags, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
-        printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
+		size_t numDevices;
+		clGetProgramInfo( m_cpProgram, CL_PROGRAM_DEVICES, 0, 0, &numDevices );
+		cl_device_id *devices = new cl_device_id[numDevices];
+		clGetProgramInfo( m_cpProgram, CL_PROGRAM_DEVICES, numDevices, devices, &numDevices );
+        for( int i = 0; i < 2; ++i )
+		{
+			char *build_log;
+			size_t ret_val_size;
+			clGetProgramBuildInfo(m_cpProgram, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+			build_log = new char[ret_val_size+1];
+			clGetProgramBuildInfo(m_cpProgram, devices[i], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+    
+			// to be carefully, terminate with \0
+			// there's no information in the reference whether the string is 0 terminated or not
+			build_log[ret_val_size] = '\0';
+        
+
+			printf("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log);
+			delete[] build_log;
+		}
 		btAssert(0);
        exit(0);
    }
 	
+	
    // Create the kernel
    kernel = clCreateKernel(m_cpProgram, kernelName, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
@@ -1252,37 +1620,123 @@ cl_kernel btOpenCLSoftBodySolver::compileCLKernelFromString( const char* kernelS
    }

 	printf("ready. \n");
+	delete [] compileFlags;
 	return kernel;

 }

 void btOpenCLSoftBodySolver::predictMotion( float timeStep )
 {
-	// Fill the force arrays with current acceleration data etc
-	m_perClothWindVelocity.resize( m_softBodySet.size() );
-	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
+	// Clear the collision shape array for the next frame
+	// Flag the OpenCL-side collision buffers as changed on the CPU (this comment originally said "DX11") so they will be updated correctly
+	m_clCollisionObjectDetails.changedOnCPU();
+	m_clPerClothCollisionObjects.changedOnCPU();
+	m_collisionObjectDetails.clear();
+	
 	{
-		btSoftBody *softBody = m_softBodySet[softBodyIndex]->getSoftBody();
-		
-		m_perClothWindVelocity[softBodyIndex] = toVector3(softBody->getWindVelocity());
+		BT_PROFILE("perClothWindVelocity");
+		// Fill the force arrays with current acceleration data etc
+		m_perClothWindVelocity.resize( m_softBodySet.size() );
+		for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
+		{
+			btSoftBody *softBody = m_softBodySet[softBodyIndex]->getSoftBody();
+			
+			m_perClothWindVelocity[softBodyIndex] = toVector3(softBody->getWindVelocity());
+		}
+	}
+	{
+		BT_PROFILE("changedOnCPU");
+		m_clPerClothWindVelocity.changedOnCPU();
 	}
-	m_clPerClothWindVelocity.changedOnCPU();

-	// Apply forces that we know about to the cloths
-	applyForces(  timeStep * getTimeScale() );
+	{
+		BT_PROFILE("applyForces");
+		// Apply forces that we know about to the cloths
+		applyForces(  timeStep * getTimeScale() );
+	}

-	// Itegrate motion for all soft bodies dealt with by the solver
-	integrate( timeStep * getTimeScale() );
+	{
+		BT_PROFILE("integrate");
+		// Integrate motion for all soft bodies dealt with by the solver
+		integrate( timeStep * getTimeScale() );
+	}
+
+	{
+		BT_PROFILE("updateBounds");
+		updateBounds();
+	}
 	// End prediction work for solvers
 }

+static Vectormath::Aos::Transform3 toTransform3( const btTransform &transform )
+{
+	// Columns 0-2 are copied from the basis columns, column 3 from the origin.
+	Vectormath::Aos::Transform3 converted;
+	for( int column = 0; column < 3; ++column )
+		converted.setCol(column, toVector3(transform.getBasis().getColumn(column)));
+	converted.setCol(3, toVector3(transform.getOrigin()));
+	return converted;
+}
+
+void btOpenCLAcceleratedSoftBodyInterface::updateBounds( const btVector3 &lowerBound, const btVector3 &upperBound )
+{
+	const float margin = getSoftBody()->getCollisionShape()->getMargin();
+	const btVector3 padding( margin, margin, margin );
+	m_softBody->m_bounds[0] = lowerBound - padding;	// lower corner moved out by the collision margin
+	m_softBody->m_bounds[1] = upperBound + padding;	// upper corner moved out by the collision margin
+}  // btOpenCLAcceleratedSoftBodyInterface::updateBounds
+
+void btOpenCLSoftBodySolver::processCollision( btSoftBody*, btSoftBody* )
+{
+	// No-op: this overload does nothing for a soft body / soft body pair.
+}
+
+// Record a capsule collision object against the given soft body by appending a description to m_collisionObjectDetails; other shape types add nothing
+void btOpenCLSoftBodySolver::processCollision( btSoftBody *softBody, btCollisionObject* collisionObject )
+{
+ 	int softBodyIndex = findSoftBodyIndex( softBody );
+	// A non-negative index is treated as a soft body known to this solver
+	if( softBodyIndex >= 0 )
+	{
+		btCollisionShape *collisionShape = collisionObject->getCollisionShape();
+		float friction = collisionObject->getFriction();
+		int shapeType = collisionShape->getShapeType();
+		if( shapeType == CAPSULE_SHAPE_PROXYTYPE )
+		{
+			// Add to the list of expected collision objects
+			CollisionShapeDescription newCollisionShapeDescription;
+			newCollisionShapeDescription.softBodyIdentifier = softBodyIndex;
+			newCollisionShapeDescription.collisionShapeType = shapeType;
+			// TODO: May need to transpose this matrix either here or in the kernel code (this comment originally said "HLSL")
+			newCollisionShapeDescription.shapeTransform = toTransform3(collisionObject->getWorldTransform());
+			btCapsuleShape *capsule = static_cast<btCapsuleShape*>( collisionShape );
+			newCollisionShapeDescription.radius = capsule->getRadius();
+			newCollisionShapeDescription.halfHeight = capsule->getHalfHeight();
+			newCollisionShapeDescription.margin = capsule->getMargin();
+			newCollisionShapeDescription.upAxis = capsule->getUpAxis();
+			newCollisionShapeDescription.friction = friction;
+			btRigidBody* body = static_cast< btRigidBody* >( collisionObject ); // NOTE(review): unchecked downcast; assumes the collision object is a btRigidBody - confirm
+			newCollisionShapeDescription.linearVelocity = toVector3(body->getLinearVelocity());
+			newCollisionShapeDescription.angularVelocity = toVector3(body->getAngularVelocity());
+			m_collisionObjectDetails.push_back( newCollisionShapeDescription );
+
+		} else {
+			btAssert("Unsupported collision shape type\n"); // NOTE: the argument is a non-null string literal, so this assert can never fire
+		}
+	} else {
+		btAssert("Unknown soft body"); // NOTE: the argument is a non-null string literal, so this assert can never fire
+	}
+} // btOpenCLSoftBodySolver::processCollision


-btOpenCLAcceleratedSoftBodyInterface *btOpenCLSoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
+
+
+
+btOpenCLAcceleratedSoftBodyInterface* btOpenCLSoftBodySolver::findSoftBodyInterface( const btSoftBody* const softBody )
 {
 	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
 	{
-		btOpenCLAcceleratedSoftBodyInterface *softBodyInterface = m_softBodySet[softBodyIndex];
+		btOpenCLAcceleratedSoftBodyInterface* softBodyInterface = m_softBodySet[softBodyIndex];
 		if( softBodyInterface->getSoftBody() == softBody )
 			return softBodyInterface;
 	}
@@ -1290,27 +1744,50 @@ btOpenCLAcceleratedSoftBodyInterface *btOpenCLSoftBodySolver::findSoftBodyInterf
 }


+// Return the index in m_softBodySet of the interface wrapping softBody, or -1 when this solver does not hold it
+int btOpenCLSoftBodySolver::findSoftBodyIndex( const btSoftBody* const softBody )
+{
+	for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )
+	{
+		if( m_softBodySet[softBodyIndex]->getSoftBody() == softBody )
+			return softBodyIndex;
+	}
+	return -1;	// not found; must be negative because processCollision tests softBodyIndex >= 0 (1 would alias a real cloth)
+}
+
+bool btOpenCLSoftBodySolver::checkInitialized()
+{
+	// buildShaders() is only attempted while the shaders are not yet marked initialised
+	if( !m_shadersInitialized && buildShaders() )
+		m_shadersInitialized = true;
+
+	return m_shadersInitialized;
+}
+
 bool btOpenCLSoftBodySolver::buildShaders()
 {
+	// Ensure current kernels are released first
+	releaseKernels();
+
 	bool returnVal = true;

 	if( m_shadersInitialized )
 		return true;
 	
-	prepareLinksKernel = compileCLKernelFromString( PrepareLinksCLString, "PrepareLinksKernel" );
-	updatePositionsFromVelocitiesKernel = compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel" );
-	solvePositionsFromLinksKernel = compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel" );
-	updateVelocitiesFromPositionsWithVelocitiesKernel = compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel" );
-	updateVelocitiesFromPositionsWithoutVelocitiesKernel = compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel" );
-	integrateKernel = compileCLKernelFromString( IntegrateCLString, "IntegrateKernel" );
-	applyForcesKernel = compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel" );
+	prepareLinksKernel = clFunctions.compileCLKernelFromString( PrepareLinksCLString, "PrepareLinksKernel" );
+	updatePositionsFromVelocitiesKernel = clFunctions.compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel" );
+	solvePositionsFromLinksKernel = clFunctions.compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel" );
+	updateVelocitiesFromPositionsWithVelocitiesKernel = clFunctions.compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel" );
+	updateVelocitiesFromPositionsWithoutVelocitiesKernel = clFunctions.compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel" );
+	computeBoundsKernel = clFunctions.compileCLKernelFromString( ComputeBoundsCLString, "ComputeBoundsKernel" );
+	solveCollisionsAndUpdateVelocitiesKernel = clFunctions.compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel" );
+	integrateKernel = clFunctions.compileCLKernelFromString( IntegrateCLString, "IntegrateKernel" );
+	applyForcesKernel = clFunctions.compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel" );

 	// TODO: Rename to UpdateSoftBodies
-	resetNormalsAndAreasKernel = compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel" );
-	normalizeNormalsAndAreasKernel = compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel" );
-	updateSoftBodiesKernel = compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel" );
-	//outputToVertexArrayWithNormalsKernel = compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithNormalsKernel" );
-	//outputToVertexArrayWithoutNormalsKernel = compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithoutNormalsKernel" );
+	resetNormalsAndAreasKernel = clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel" );
+	normalizeNormalsAndAreasKernel = clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel" );
+	updateSoftBodiesKernel = clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel" );


 	if( returnVal )
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCL.h
@@ -25,12 +25,60 @@ subject to the following restrictions:
 #include "btSoftBodySolverVertexData_OpenCL.h"
 #include "btSoftBodySolverTriangleData_OpenCL.h"

+class CLFunctions
+{
+protected:
+	cl_command_queue	m_cqCommandQue;
+	cl_context			m_cxMainContext;
+	// Both handles above are simply copied from the constructor arguments.
+public:
+	CLFunctions(cl_command_queue cqCommandQue, cl_context cxMainContext) :
+		m_cqCommandQue( cqCommandQue ),
+		m_cxMainContext( cxMainContext )
+	{
+	}
+
+	/**
+	 * Compile a compute shader kernel from a string and return the cl_kernel created for kernelName.
+	 * additionalMacros is appended to the clBuildProgram option string; a failed build prints the build log and calls exit(0).
+	 */
+	cl_kernel compileCLKernelFromString( const char* kernelSource, const char* kernelName, const char* additionalMacros = "" );
+};

 /**
- * SoftBody class to maintain information about a soft body instance
- * within a solver.
- * This data addresses the main solver arrays.
+ * Entry in the collision shape array.
+ * Specifies the shape type, the transform matrix and the necessary details of the collisionShape.
 */
+struct CollisionShapeDescription
+{
+	Vectormath::Aos::Transform3 shapeTransform;
+	Vectormath::Aos::Vector3 linearVelocity;
+	Vectormath::Aos::Vector3 angularVelocity;
+	// Index of the owning soft body within the solver, and the Bullet shape type of the collision object
+	int softBodyIdentifier;
+	int collisionShapeType;
+
+	// Both needed for capsule
+	float radius;
+	float halfHeight;
+	int upAxis;
+	
+	float margin;
+	float friction;
+	// Zero every scalar member so a default-constructed entry never carries indeterminate values
+	CollisionShapeDescription() :
+		softBodyIdentifier( 0 ), collisionShapeType( 0 ),
+		radius( 0 ), halfHeight( 0 ), upAxis( 0 ),
+		margin( 0 ), friction( 0 )
+	{
+	}
+};
+
+/**
+	 * SoftBody class to maintain information about a soft body instance
+	 * within a solver.
+	 * This data addresses the main solver arrays.
+	 */
 class btOpenCLAcceleratedSoftBodyInterface
 {
 protected:
@@ -100,6 +148,11 @@ public:
 	{
 		return m_firstTriangle;
 	}
+	
+	/**
+	 * Update the bounds in the btSoftBody object
+	 */
+	void updateBounds( const btVector3 &lowerBound, const btVector3 &upperBound );

 	// TODO: All of these set functions will have to do checks and
 	// update the world because restructuring of the arrays will be necessary
@@ -108,7 +161,7 @@ public:
 	{
 		m_numVertices = numVertices;
 	}	
-	
+
 	void setNumTriangles( int numTriangles )
 	{
 		m_numTriangles = numTriangles;
@@ -172,20 +225,61 @@ public:
 };


+
 class btOpenCLSoftBodySolver : public btSoftBodySolver
 {
-private:
+public:
+	
+
+	struct UIntVector3
+	{
+		// Default construction zeroes all four words.
+		UIntVector3() :
+			x( 0 ),
+			y( 0 ),
+			z( 0 ),
+			_padding( 0 )
+		{
+		}
+
+		// Component-wise construction; the padding word is still zeroed.
+		UIntVector3( unsigned int x_, unsigned int y_, unsigned int z_ ) :
+			x( x_ ), y( y_ ), z( z_ ), _padding( 0 )
+		{
+		}
+
+		// Three components followed by one padding word.
+		unsigned int x;
+		unsigned int y;
+		unsigned int z;
+		unsigned int _padding;
+	};
+
+	struct CollisionObjectIndices
+	{
+		CollisionObjectIndices( int f, int e ) :
+			firstObject( f ),
+			endObject( e )
+		{
+		}
+
+		int firstObject;	// set from constructor argument f
+		int endObject;		// set from constructor argument e; NOTE(review): presumably an end index rather than a count - confirm
+	};

 	btSoftBodyLinkDataOpenCL m_linkData;
 	btSoftBodyVertexDataOpenCL m_vertexData;
 	btSoftBodyTriangleDataOpenCL m_triangleData;

+protected:
+
+	CLFunctions clFunctions;
+
 	/** Variable to define whether we need to update solver constants on the next iteration */
 	bool m_updateSolverConstants;

 	bool m_shadersInitialized;

-
 	/** 
 	 * Cloths owned by this solver.
 	 * Only our cloths are in this array.
@@ -224,6 +318,46 @@ private:
 	btAlignedObjectArray< float >						m_perClothMediumDensity;
 	btOpenCLBuffer<float>								m_clPerClothMediumDensity;

+	/** 
+	 * Collision shape details: pair of index of first collision shape for the cloth and number of collision objects.
+	 */
+	btAlignedObjectArray< CollisionObjectIndices >		m_perClothCollisionObjects;
+	btOpenCLBuffer<CollisionObjectIndices>				m_clPerClothCollisionObjects;
+
+	/** 
+	 * Collision shapes being passed across to the cloths in this solver.
+	 */
+	btAlignedObjectArray< CollisionShapeDescription >	m_collisionObjectDetails;
+	btOpenCLBuffer< CollisionShapeDescription >			m_clCollisionObjectDetails;
+
+	/** 
+	 * Minimum bounds for each cloth.
+	 * Updated by GPU and returned for use by broad phase.
+	 * These are int vectors as a reminder that they store the int representation of a float, not a float.
+	 * Bit 31 is inverted - i.e. floats are stored with int-sortable values.
+	 * This is really a uint4 array but thanks to a limitation of OpenCL atomics we are using uints.
+	 */
+	btAlignedObjectArray< UIntVector3 >		m_perClothMinBounds;
+	btOpenCLBuffer< UIntVector3 >			m_clPerClothMinBounds;
+
+	/** 
+	 * Maximum bounds for each cloth.
+	 * Updated by GPU and returned for use by broad phase.
+	 * These are int vectors as a reminder that they store the int representation of a float, not a float.
+	 * Bit 31 is inverted - i.e. floats are stored with int-sortable values.
+	 */
+	btAlignedObjectArray< UIntVector3 >		m_perClothMaxBounds;
+	btOpenCLBuffer< UIntVector3 >			m_clPerClothMaxBounds;
+
+	
+	/** 
+	 * Friction coefficient for each cloth
+	 */
+	btAlignedObjectArray< float >	m_perClothFriction;
+	btOpenCLBuffer< float >			m_clPerClothFriction;
+
+
+
 	cl_kernel		prepareLinksKernel;
 	cl_kernel		solvePositionsFromLinksKernel;
 	cl_kernel		updateConstantsKernel;
@@ -233,41 +367,37 @@ private:
 	cl_kernel		updateVelocitiesFromPositionsWithoutVelocitiesKernel;
 	cl_kernel		updateVelocitiesFromPositionsWithVelocitiesKernel;
 	cl_kernel		vSolveLinksKernel;
+	cl_kernel		solveCollisionsAndUpdateVelocitiesKernel;
 	cl_kernel		resetNormalsAndAreasKernel;
 	cl_kernel		normalizeNormalsAndAreasKernel;
+	cl_kernel		computeBoundsKernel;
 	cl_kernel		updateSoftBodiesKernel;
-	cl_kernel		outputToVertexArrayWithNormalsKernel;
-	cl_kernel		outputToVertexArrayWithoutNormalsKernel;

 	cl_kernel		outputToVertexArrayKernel;
 	cl_kernel		applyForcesKernel;
-	cl_kernel		collideSphereKernel;
-	cl_kernel		collideCylinderKernel;

 	cl_command_queue	m_cqCommandQue;
 	cl_context			m_cxMainContext;
-
+	
 	size_t				m_defaultWorkGroupSize;


-	/**
-	 * Compile a compute shader kernel from a string and return the appropriate cl_kernel object.
-	 */
-	cl_kernel compileCLKernelFromString( const char *shaderString, const char *shaderName );
-
-	bool buildShaders();
+	virtual bool buildShaders();

 	void resetNormalsAndAreas( int numVertices );

 	void normalizeNormalsAndAreas( int numVertices );

 	void executeUpdateSoftBodies( int firstTriangle, int numTriangles );
+
+	void prepareCollisionConstraints();
 	
 	Vectormath::Aos::Vector3 ProjectOnAxis( const Vectormath::Aos::Vector3 &v, const Vectormath::Aos::Vector3 &a );

 	void ApplyClampedForce( float solverdt, const Vectormath::Aos::Vector3 &force, const Vectormath::Aos::Vector3 &vertexVelocity, float inverseMass, Vectormath::Aos::Vector3 &vertexForce );
 	
-	btOpenCLAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );
+
+	int findSoftBodyIndex( const btSoftBody* const softBody );

 	virtual void applyForces( float solverdt );

@@ -276,7 +406,7 @@ private:
 	 */
 	virtual void integrate( float solverdt );

-	void updateConstants( float timeStep );
+	virtual void updateConstants( float timeStep );

 	float computeTriangleArea( 
 		const Vectormath::Aos::Point3 &vertex0,
@@ -292,15 +422,20 @@ private:

 	void updatePositionsFromVelocities( float solverdt );

-	void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
+	virtual void solveLinksForPosition( int startLink, int numLinks, float kst, float ti );
 	
 	void updateVelocitiesFromPositionsWithVelocities( float isolverdt );

 	void updateVelocitiesFromPositionsWithoutVelocities( float isolverdt );
+	void computeBounds( );
+	virtual void solveCollisionsAndUpdateVelocities( float isolverdt );

 	// End kernel dispatches
 	/////////////////////////////////////
 	
+	void updateBounds();
+
+	void releaseKernels();

 public:
 	btOpenCLSoftBodySolver(cl_command_queue queue,cl_context	ctx);
@@ -308,7 +443,8 @@ public:
 	virtual ~btOpenCLSoftBodySolver();


-
+	
+	btOpenCLAcceleratedSoftBodyInterface *findSoftBodyInterface( const btSoftBody* const softBody );

 	virtual btSoftBodyLinkData &getLinkData();

@@ -316,20 +452,27 @@ public:

 	virtual btSoftBodyTriangleData &getTriangleData();

-
+	virtual SolverTypes getSolverType() const
+	{
+		return CL_SOLVER;
+	}


 	virtual bool checkInitialized();

 	virtual void updateSoftBodies( );

-	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies );
+	virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false);
+
+	virtual void copyBackToSoftBodies();

 	virtual void solveConstraints( float solverdt );

 	virtual void predictMotion( float solverdt );

-	virtual void copySoftBodyToVertexBuffer( const btSoftBody *const softBody, btVertexBufferDescriptor *vertexBuffer );
+	virtual void processCollision( btSoftBody *, btCollisionObject* );
+
+	virtual void processCollision( btSoftBody*, btSoftBody* );

 	virtual void	setDefaultWorkgroupSize(size_t workGroupSize)
 	{
@@ -339,6 +482,27 @@ public:
 	{
 		return m_defaultWorkGroupSize;
 	}
+	
 }; // btOpenCLSoftBodySolver

+
+/** 
+ * Class to manage movement of data from a solver to a given target.
+ * This version is the CL to CPU version.
+ */
+class btSoftBodySolverOutputCLtoCPU : public btSoftBodySolverOutput
+{
+protected:
+	// No protected members are declared.
+public:
+	btSoftBodySolverOutputCLtoCPU()
+	{
+	}
+
+	/** Output the current computed vertex data of softBody to vertexBuffer; the signature takes one soft body and one buffer descriptor per call. */
+	virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer );
+};
+
+
+
 #endif // #ifndef BT_SOFT_BODY_SOLVER_OPENCL_H