diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/ComputeBounds.hlsl b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/ComputeBounds.hlsl new file mode 100644 index 000000000..e21f959cc --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/ComputeBounds.hlsl @@ -0,0 +1,83 @@ +MSTRINGIFY( + +cbuffer ComputeBoundsCB : register( b0 ) +{ + int numNodes; + int numSoftBodies; + int padding1; + int padding2; +}; + +// Node indices for each link +StructuredBuffer g_vertexClothIdentifier : register( t0 ); +StructuredBuffer g_vertexPositions : register( t1 ); + +RWStructuredBuffer g_clothMinBounds : register( u0 ); +RWStructuredBuffer g_clothMaxBounds : register( u1 ); + +groupshared uint4 clothMinBounds[256]; +groupshared uint4 clothMaxBounds[256]; + +[numthreads(128, 1, 1)] +void +ComputeBoundsKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + const unsigned int UINT_MAX = 0xffffffff; + + // Init min and max bounds arrays + if( GTid.x < numSoftBodies ) + { + clothMinBounds[GTid.x] = uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX); + clothMaxBounds[GTid.x] = uint4(0,0,0,0); + } + + AllMemoryBarrierWithGroupSync(); + + int nodeID = DTid.x; + if( nodeID < numNodes ) + { + int clothIdentifier = g_vertexClothIdentifier[nodeID]; + if( clothIdentifier >= 0 ) + { + float3 position = g_vertexPositions[nodeID].xyz; + + // Reinterpret position as uint + uint3 positionUInt = uint3(asuint(position.x), asuint(position.y), asuint(position.z)); + + // Invert sign bit of positives and whole of negatives to allow comparison as unsigned ints + //positionUInt.x ^= uint((-int(positionUInt.x >> 31) | 0x80000000)); + //positionUInt.y ^= uint((-int(positionUInt.y >> 31) | 0x80000000)); + //positionUInt.z ^= uint((-int(positionUInt.z >> 31) | 0x80000000)); + positionUInt.x ^= (1+~(positionUInt.x >> 31) | 0x80000000); + positionUInt.y ^= (1+~(positionUInt.y >> 31) | 0x80000000); + positionUInt.z ^= (1+~(positionUInt.z >> 31) | 0x80000000); + + // Min/max with the LDS values + InterlockedMin(clothMinBounds[clothIdentifier].x, positionUInt.x); + InterlockedMin(clothMinBounds[clothIdentifier].y, positionUInt.y); + InterlockedMin(clothMinBounds[clothIdentifier].z, positionUInt.z); + + InterlockedMax(clothMaxBounds[clothIdentifier].x, positionUInt.x); + InterlockedMax(clothMaxBounds[clothIdentifier].y, positionUInt.y); + InterlockedMax(clothMaxBounds[clothIdentifier].z, positionUInt.z); + } + } + + AllMemoryBarrierWithGroupSync(); + + + // Use global atomics to update the global versions of the data + if( GTid.x < numSoftBodies ) + { + InterlockedMin(g_clothMinBounds[GTid.x].x, clothMinBounds[GTid.x].x); + InterlockedMin(g_clothMinBounds[GTid.x].y, clothMinBounds[GTid.x].y); + InterlockedMin(g_clothMinBounds[GTid.x].z, clothMinBounds[GTid.x].z); + + InterlockedMax(g_clothMaxBounds[GTid.x].x, clothMaxBounds[GTid.x].x); + InterlockedMax(g_clothMaxBounds[GTid.x].y, clothMaxBounds[GTid.x].y); + InterlockedMax(g_clothMaxBounds[GTid.x].z, clothMaxBounds[GTid.x].z); + } +} + + +); \ No newline at end of file diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocities.hlsl b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocities.hlsl new file mode 100644 index 000000000..399912f49 --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocities.hlsl @@ -0,0 +1,170 @@ +MSTRINGIFY( + +cbuffer SolvePositionsFromLinksKernelCB : register( b0 ) +{ + unsigned int numNodes; + float isolverdt; + int padding0; + int padding1; +}; + +struct CollisionObjectIndices +{ + int firstObject; + int endObject; +}; + +struct CollisionShapeDescription +{ + float4x4 shapeTransform; + float4 linearVelocity; + float4 angularVelocity; + + int softBodyIdentifier; + int collisionShapeType; + + + // Shape information + // Compressed from the union + float radius; + float halfHeight; + + float margin; + float friction; + + int padding0; + int padding1; + +}; + +// From btBroadphaseProxy.h +static const int CAPSULE_SHAPE_PROXYTYPE = 10; + +// Node indices for each link +StructuredBuffer g_vertexClothIdentifier : register( t0 ); +StructuredBuffer g_vertexPreviousPositions : register( t1 ); +StructuredBuffer g_perClothFriction : register( t2 ); +StructuredBuffer g_clothDampingFactor : register( t3 ); +StructuredBuffer g_perClothCollisionObjectIndices : register( t4 ); +StructuredBuffer g_collisionObjectDetails : register( t5 ); + +RWStructuredBuffer g_vertexForces : register( u0 ); +RWStructuredBuffer g_vertexVelocities : register( u1 ); +RWStructuredBuffer g_vertexPositions : register( u2 ); + +[numthreads(128, 1, 1)] +void +SolveCollisionsAndUpdateVelocitiesKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + int nodeID = DTid.x; + float3 forceOnVertex = float3(0.f, 0.f, 0.f); + if( DTid.x < numNodes ) + { + int clothIdentifier = g_vertexClothIdentifier[nodeID]; + float4 position = float4(g_vertexPositions[nodeID].xyz, 1.f); + float4 previousPosition = float4(g_vertexPreviousPositions[nodeID].xyz, 1.f); + float3 velocity; + float clothFriction = g_perClothFriction[clothIdentifier]; + float dampingFactor = g_clothDampingFactor[clothIdentifier]; + float velocityCoefficient = (1.f - dampingFactor); + CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier]; + + if( collisionObjectIndices.firstObject != collisionObjectIndices.endObject ) + { + velocity = float3(15, 0, 0); + + // We have some possible collisions to deal with + for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision ) + { + CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision]; + float colliderFriction = shapeDescription.friction; + + if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE ) + { + // Colliding with a capsule + + float capsuleHalfHeight = shapeDescription.halfHeight; + float capsuleRadius = shapeDescription.radius; + float capsuleMargin = shapeDescription.margin; + float4x4 worldTransform = shapeDescription.shapeTransform; + + float4 c1 = float4(0.f, -capsuleHalfHeight, 0.f, 1.f); + float4 c2 = float4(0.f, +capsuleHalfHeight, 0.f, 1.f); + float4 worldC1 = mul(worldTransform, c1); + float4 worldC2 = mul(worldTransform, c2); + float3 segment = (worldC2 - worldC1).xyz; + + // compute distance of tangent to vertex along line segment in capsule + float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) ); + + float4 closestPoint = (worldC1 + float4(segment * distanceAlongSegment, 0.f)); + float distanceFromLine = length(position - closestPoint); + float distanceFromC1 = length(worldC1 - position); + float distanceFromC2 = length(worldC2 - position); + + // Final distance from collision, point to push from, direction to push in + // for impulse force + float dist; + float3 normalVector; + if( distanceAlongSegment < 0 ) + { + dist = distanceFromC1; + normalVector = normalize(position - worldC1).xyz; + } else if( distanceAlongSegment > 1.f ) { + dist = distanceFromC2; + normalVector = normalize(position - worldC2).xyz; + } else { + dist = distanceFromLine; + normalVector = normalize(position - closestPoint).xyz; + } + + float3 colliderLinearVelocity = shapeDescription.linearVelocity.xyz; + float3 colliderAngularVelocity = shapeDescription.angularVelocity.xyz; + float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - worldTransform._m03_m13_m23); + + float minDistance = capsuleRadius + capsuleMargin; + + // In case of no collision, this is the value of velocity + velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt; + + + // Check for a collision + if( dist < minDistance ) + { + // Project back to surface along normal + position = position + float4((minDistance - dist)*normalVector*0.9, 0.f); + velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt; + float3 relativeVelocity = velocity - velocityOfSurfacePoint; + + float3 p1 = normalize(cross(normalVector, segment)); + float3 p2 = normalize(cross(p1, normalVector)); + // Full friction is sum of velocities in each direction of plane + float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2); + + // Real friction is peak friction corrected by friction coefficients + frictionVector = frictionVector * (colliderFriction*clothFriction); + + float approachSpeed = dot(relativeVelocity, normalVector); + + if( approachSpeed <= 0.0 ) + forceOnVertex -= frictionVector; + } + + } + } + } else { + // Update velocity + float3 difference = position.xyz - previousPosition.xyz; + velocity = difference*velocityCoefficient*isolverdt; + } + + g_vertexVelocities[nodeID] = float4(velocity, 0.f); + + // Update external force + g_vertexForces[nodeID] = float4(forceOnVertex, 0.f); + + g_vertexPositions[nodeID] = float4(position.xyz, 0.f); + } +} + +); diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocitiesSIMDBatched.hlsl b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocitiesSIMDBatched.hlsl new file mode 100644 index 000000000..9bb7e4da1 --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/solveCollisionsAndUpdateVelocitiesSIMDBatched.hlsl @@ -0,0 +1,191 @@ +MSTRINGIFY( + +cbuffer SolvePositionsFromLinksKernelCB : register( b0 ) +{ + unsigned int numNodes; + float isolverdt; + int padding0; + int padding1; +}; + +struct CollisionObjectIndices +{ + int firstObject; + int endObject; +}; + +struct CollisionShapeDescription +{ + float4x4 shapeTransform; + float4 linearVelocity; + float4 angularVelocity; + + int softBodyIdentifier; + int collisionShapeType; + + + // Shape information + // Compressed from the union + float radius; + float halfHeight; + + float margin; + float friction; + + int padding0; + int padding1; + +}; + +// From btBroadphaseProxy.h +static const int CAPSULE_SHAPE_PROXYTYPE = 10; + +// Node indices for each link +StructuredBuffer g_vertexClothIdentifier : register( t0 ); +StructuredBuffer g_vertexPreviousPositions : register( t1 ); +StructuredBuffer g_perClothFriction : register( t2 ); +StructuredBuffer g_clothDampingFactor : register( t3 ); +StructuredBuffer g_perClothCollisionObjectIndices : register( t4 ); +StructuredBuffer g_collisionObjectDetails : register( t5 ); + +RWStructuredBuffer g_vertexForces : register( u0 ); +RWStructuredBuffer g_vertexVelocities : register( u1 ); +RWStructuredBuffer g_vertexPositions : register( u2 ); + +// A buffer of local collision shapes +// TODO: Iterate to support more than 16 +groupshared CollisionShapeDescription localCollisionShapes[16]; + +[numthreads(128, 1, 1)] +void +SolveCollisionsAndUpdateVelocitiesKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + int nodeID = DTid.x; + float3 forceOnVertex = float3(0.f, 0.f, 0.f); + + int clothIdentifier = g_vertexClothIdentifier[nodeID]; + float4 position = float4(g_vertexPositions[nodeID].xyz, 1.f); + float4 previousPosition = float4(g_vertexPreviousPositions[nodeID].xyz, 1.f); + float3 velocity; + float clothFriction = g_perClothFriction[clothIdentifier]; + float dampingFactor = g_clothDampingFactor[clothIdentifier]; + float velocityCoefficient = (1.f - dampingFactor); + CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier]; + + int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject; + if( numObjects > 0 ) + { + // We have some possible collisions to deal with + + // First load all of the collision objects into LDS + int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject; + if( GTid.x < numObjects ) + { + localCollisionShapes[GTid.x] = g_collisionObjectDetails[ collisionObjectIndices.firstObject + GTid.x ]; + } + } + + // Safe as the vertices are padded so that not more than one soft body is in a group + AllMemoryBarrierWithGroupSync(); + + // Annoyingly, even though I know the flow control is not varying, the compiler will not let me skip this + if( numObjects > 0 ) + { + velocity = float3(0, 0, 0); + + + // We have some possible collisions to deal with + for( int collision = 0; collision < numObjects; ++collision ) + { + CollisionShapeDescription shapeDescription = localCollisionShapes[collision]; + float colliderFriction = shapeDescription.friction; + + if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE ) + { + // Colliding with a capsule + + float capsuleHalfHeight = localCollisionShapes[collision].halfHeight; + float capsuleRadius = localCollisionShapes[collision].radius; + float capsuleMargin = localCollisionShapes[collision].margin; + + float4x4 worldTransform = localCollisionShapes[collision].shapeTransform; + + float4 c1 = float4(0.f, -capsuleHalfHeight, 0.f, 1.f); + float4 c2 = float4(0.f, +capsuleHalfHeight, 0.f, 1.f); + float4 worldC1 = mul(worldTransform, c1); + float4 worldC2 = mul(worldTransform, c2); + float3 segment = (worldC2 - worldC1).xyz; + + // compute distance of tangent to vertex along line segment in capsule + float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) ); + + float4 closestPoint = (worldC1 + float4(segment * distanceAlongSegment, 0.f)); + float distanceFromLine = length(position - closestPoint); + float distanceFromC1 = length(worldC1 - position); + float distanceFromC2 = length(worldC2 - position); + + // Final distance from collision, point to push from, direction to push in + // for impulse force + float dist; + float3 normalVector; + if( distanceAlongSegment < 0 ) + { + dist = distanceFromC1; + normalVector = normalize(position - worldC1).xyz; + } else if( distanceAlongSegment > 1.f ) { + dist = distanceFromC2; + normalVector = normalize(position - worldC2).xyz; + } else { + dist = distanceFromLine; + normalVector = normalize(position - closestPoint).xyz; + } + + float3 colliderLinearVelocity = localCollisionShapes[collision].linearVelocity.xyz; + float3 colliderAngularVelocity = localCollisionShapes[collision].angularVelocity.xyz; + float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - worldTransform._m03_m13_m23); + + float minDistance = capsuleRadius + capsuleMargin; + + // In case of no collision, this is the value of velocity + velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt; + + + // Check for a collision + if( dist < minDistance ) + { + // Project back to surface along normal + position = position + float4((minDistance - dist)*normalVector*0.9, 0.f); + velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt; + float3 relativeVelocity = velocity - velocityOfSurfacePoint; + + float3 p1 = normalize(cross(normalVector, segment)); + float3 p2 = normalize(cross(p1, normalVector)); + // Full friction is sum of velocities in each direction of plane + float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2); + + // Real friction is peak friction corrected by friction coefficients + frictionVector = frictionVector * (colliderFriction*clothFriction); + + float approachSpeed = dot(relativeVelocity, normalVector); + + if( approachSpeed <= 0.0 ) + forceOnVertex -= frictionVector; + } + + } + } + } else { + // Update velocity + float3 difference = position.xyz - previousPosition.xyz; + velocity = difference*velocityCoefficient*isolverdt; + } + + g_vertexVelocities[nodeID] = float4(velocity, 0.f); + + // Update external force + g_vertexForces[nodeID] = float4(forceOnVertex, 0.f); + + g_vertexPositions[nodeID] = float4(position.xyz, 0.f); +} + +); diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ComputeBounds.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ComputeBounds.cl new file mode 100644 index 000000000..15c0cdc67 --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/ComputeBounds.cl @@ -0,0 +1,80 @@ +MSTRINGIFY( +#pragma OPENCL EXTENSION cl_amd_printf : enable \n + + + +__kernel void +ComputeBoundsKernel( + int numNodes, + int numSoftBodies, + __global int * g_vertexClothIdentifier, + __global float4 * g_vertexPositions, + volatile __global uint * g_clothMinBounds, + volatile __global uint * g_clothMaxBounds, + volatile __local uint * clothMinBounds, + volatile __local uint * clothMaxBounds) +{ + // Init min and max bounds arrays + if( get_local_id(0) < numSoftBodies ) + { + + clothMinBounds[get_local_id(0)*4] = UINT_MAX; + clothMinBounds[get_local_id(0)*4+1] = UINT_MAX; + clothMinBounds[get_local_id(0)*4+2] = UINT_MAX; + clothMinBounds[get_local_id(0)*4+3] = UINT_MAX; + clothMaxBounds[get_local_id(0)*4] = 0; + clothMaxBounds[get_local_id(0)*4+1] = 0; + clothMaxBounds[get_local_id(0)*4+2] = 0; + clothMaxBounds[get_local_id(0)*4+3] = 0; + + } + + + barrier(CLK_GLOBAL_MEM_FENCE); + + int nodeID = get_global_id(0); + if( nodeID < numNodes ) + { + int clothIdentifier = g_vertexClothIdentifier[get_global_id(0)]; + if( clothIdentifier >= 0 ) + { + float3 position = g_vertexPositions[get_global_id(0)].xyz; + + /* Reinterpret position as uint */ + uint3 positionUInt = (uint3)(as_uint(position.x), as_uint(position.y), as_uint(position.z)); + + /* Invert sign bit of positives and whole of negatives to allow comparison as unsigned ints */ + positionUInt.x ^= (1+~(positionUInt.x >> 31) | 0x80000000); + positionUInt.y ^= (1+~(positionUInt.y >> 31) | 0x80000000); + positionUInt.z ^= (1+~(positionUInt.z >> 31) | 0x80000000); + + /* Min/max with the LDS values */ + atomic_min(&(clothMinBounds[clothIdentifier*4]), positionUInt.x); + atomic_min(&(clothMinBounds[clothIdentifier*4+1]), positionUInt.y); + atomic_min(&(clothMinBounds[clothIdentifier*4+2]), positionUInt.z); + + atomic_max(&(clothMaxBounds[clothIdentifier*4]), positionUInt.x); + atomic_max(&(clothMaxBounds[clothIdentifier*4+1]), positionUInt.y); + atomic_max(&(clothMaxBounds[clothIdentifier*4+2]), positionUInt.z); + } + } + + barrier(CLK_GLOBAL_MEM_FENCE); + + + /* Use global atomics to update the global versions of the data*/ + if( get_local_id(0) < numSoftBodies ) + { + /*atomic_min(&(g_clothMinBounds[get_local_id(0)].x), clothMinBounds[get_local_id(0)].x);*/ + atomic_min(&(g_clothMinBounds[get_local_id(0)*4]), clothMinBounds[get_local_id(0)*4]); + atomic_min(&(g_clothMinBounds[get_local_id(0)*4+1]), clothMinBounds[get_local_id(0)*4+1]); + atomic_min(&(g_clothMinBounds[get_local_id(0)*4+2]), clothMinBounds[get_local_id(0)*4+2]); + + atomic_max(&(g_clothMaxBounds[get_local_id(0)*4]), clothMaxBounds[get_local_id(0)*4]); + atomic_max(&(g_clothMaxBounds[get_local_id(0)*4+1]), clothMaxBounds[get_local_id(0)*4+1]); + atomic_max(&(g_clothMaxBounds[get_local_id(0)*4+2]), clothMaxBounds[get_local_id(0)*4+2]); + } +} + + +); diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/OutputToVertexArray.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/OutputToVertexArray.cl new file mode 100644 index 000000000..4bc614c09 --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/OutputToVertexArray.cl @@ -0,0 +1,57 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +cbuffer OutputToVertexArrayCB : register( b0 ) +{ + int startNode; + int numNodes; + int offsetX; + int strideX; + + int offsetN; + int strideN; + int padding1; + int padding2; +}; + + +StructuredBuffer g_nodesx : register( t0 ); +StructuredBuffer g_nodesn : register( t1 ); + +RWStructuredBuffer g_vertexBuffer : register( u0 ); + + +[numthreads(128, 1, 1)] +void +OutputToVertexArrayKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex ) +{ + int nodeID = DTid.x; + if( nodeID < numNodes ) + { + float4 nodeX = g_nodesx[nodeID + startNode]; + float4 nodeN = g_nodesn[nodeID + startNode]; + + // Stride should account for the float->float4 conversion + int positionDestination = nodeID * strideX + offsetX; + g_vertexBuffer[positionDestination] = nodeX.x; + g_vertexBuffer[positionDestination+1] = nodeX.y; + g_vertexBuffer[positionDestination+2] = nodeX.z; + + int normalDestination = nodeID * strideN + offsetN; + g_vertexBuffer[normalDestination] = nodeN.x; + g_vertexBuffer[normalDestination+1] = nodeN.y; + g_vertexBuffer[normalDestination+2] = nodeN.z; + } +} \ No newline at end of file diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositionsSIMDBatched.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositionsSIMDBatched.cl new file mode 100644 index 000000000..28255300e --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/SolvePositionsSIMDBatched.cl @@ -0,0 +1,139 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +MSTRINGIFY( + +float mydot3(float4 a, float4 b) +{ + return a.x*b.x + a.y*b.y + a.z*b.z; +} + +__kernel void +SolvePositionsFromLinksKernel( + const int startWaveInBatch, + const int numWaves, + const float kst, + const float ti, + __global int2 *g_wavefrontBatchCountsVertexCounts, + __global int *g_vertexAddressesPerWavefront, + __global int2 * g_linksVertexIndices, + __global float * g_linksMassLSC, + __global float * g_linksRestLengthSquared, + __global float * g_verticesInverseMass, + __global float4 * g_vertexPositions, + __local int2 *wavefrontBatchCountsVertexCounts, + __local float4 *vertexPositionSharedData, + __local float *vertexInverseMassSharedData) +{ + const int laneInWavefront = (get_global_id(0) & (WAVEFRONT_SIZE-1)); + const int wavefront = startWaveInBatch + (get_global_id(0) / WAVEFRONT_SIZE); + const int firstWavefrontInBlock = startWaveInBatch + get_group_id(0) * WAVEFRONT_BLOCK_MULTIPLIER; + const int localWavefront = wavefront - firstWavefrontInBlock; + + // Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier + if( wavefront < (startWaveInBatch + numWaves) ) + { + // Load the batch counts for the wavefronts + // Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier + if( laneInWavefront == 0 ) + { + int2 batchesAndVertexCountsWithinWavefront = g_wavefrontBatchCountsVertexCounts[wavefront]; + wavefrontBatchCountsVertexCounts[localWavefront] = batchesAndVertexCountsWithinWavefront; + } + + + mem_fence(CLK_LOCAL_MEM_FENCE); + + + int2 batchesAndVerticesWithinWavefront = wavefrontBatchCountsVertexCounts[localWavefront]; + int batchesWithinWavefront = batchesAndVerticesWithinWavefront.x; + int verticesUsedByWave = batchesAndVerticesWithinWavefront.y; + + // Load the vertices for the wavefronts + for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE ) + { + int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex]; + + vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_vertexPositions[vertexAddress]; + vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress]; + } + + mem_fence(CLK_LOCAL_MEM_FENCE); + + // Loop through the batches performing the solve on each in LDS + int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE; + + //for( int batch = 0; batch < batchesWithinWavefront; ++batch ) + + int batch = 0; + do + { + int baseDataLocation = baseDataLocationForWave + WAVEFRONT_SIZE * batch; + int locationOfValue = baseDataLocation + laneInWavefront; + + + // These loads should all be perfectly linear across the WF + int2 localVertexIndices = g_linksVertexIndices[locationOfValue]; + float massLSC = g_linksMassLSC[locationOfValue]; + float restLengthSquared = g_linksRestLengthSquared[locationOfValue]; + + // LDS vertex addresses based on logical wavefront number in block and loaded index + int vertexAddress0 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.x; + int vertexAddress1 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.y; + + float4 position0 = vertexPositionSharedData[vertexAddress0]; + float4 position1 = vertexPositionSharedData[vertexAddress1]; + + float inverseMass0 = vertexInverseMassSharedData[vertexAddress0]; + float inverseMass1 = vertexInverseMassSharedData[vertexAddress1]; + + float4 del = position1 - position0; + float len = mydot3(del, del); + + float k = 0; + if( massLSC > 0.0f ) + { + k = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst; + } + + position0 = position0 - del*(k*inverseMass0); + position1 = position1 + del*(k*inverseMass1); + + // Ensure compiler does not re-order memory operations + mem_fence(CLK_LOCAL_MEM_FENCE); + + vertexPositionSharedData[vertexAddress0] = position0; + vertexPositionSharedData[vertexAddress1] = position1; + + // Ensure compiler does not re-order memory operations + mem_fence(CLK_LOCAL_MEM_FENCE); + + + ++batch; + } while( batch < batchesWithinWavefront ); + + // Update the global memory vertices for the wavefronts + for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE ) + { + int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex]; + + g_vertexPositions[vertexAddress] = (float4)(vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex].xyz, 0.f); + } + + } + +} + +); diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocities.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocities.cl new file mode 100644 index 000000000..9f50da8a4 --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocities.cl @@ -0,0 +1,195 @@ +MSTRINGIFY( + +typedef struct +{ + int firstObject; + int endObject; +} CollisionObjectIndices; + +typedef struct +{ + float4 shapeTransform[4]; // column major 4x4 matrix + float4 linearVelocity; + float4 angularVelocity; + + int softBodyIdentifier; + int collisionShapeType; + + + // Shape information + // Compressed from the union + float radius; + float halfHeight; + int upAxis; + + float margin; + float friction; + + int padding0; + +} CollisionShapeDescription; + +/* From btBroadphaseProxy.h */ +__constant int CAPSULE_SHAPE_PROXYTYPE = 10; + +/* Multiply column-major matrix against vector */ +float4 matrixVectorMul( float4 matrix[4], float4 vector ) +{ + float4 returnVector; + float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x); + float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y); + float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z); + float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w); + returnVector.x = dot(row0, vector); + returnVector.y = dot(row1, vector); + returnVector.z = dot(row2, vector); + returnVector.w = dot(row3, vector); + return returnVector; +} + +__kernel void +SolveCollisionsAndUpdateVelocitiesKernel( + const int numNodes, + const float isolverdt, + __global int *g_vertexClothIdentifier, + __global float4 *g_vertexPreviousPositions, + __global float * g_perClothFriction, + __global float * g_clothDampingFactor, + __global CollisionObjectIndices * g_perClothCollisionObjectIndices, + __global CollisionShapeDescription * g_collisionObjectDetails, + __global float4 * g_vertexForces, + __global float4 *g_vertexVelocities, + __global float4 *g_vertexPositions) +{ + int nodeID = get_global_id(0); + float3 forceOnVertex = (float3)(0.f, 0.f, 0.f); + if( get_global_id(0) < numNodes ) + { + int clothIdentifier = g_vertexClothIdentifier[nodeID]; + + // Abort if this is not a valid cloth + if( clothIdentifier < 0 ) + return; + + float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f); + float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f); + float3 velocity; + float clothFriction = g_perClothFriction[clothIdentifier]; + float dampingFactor = g_clothDampingFactor[clothIdentifier]; + float velocityCoefficient = (1.f - dampingFactor); + CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier]; + + if( collisionObjectIndices.firstObject != collisionObjectIndices.endObject ) + { + velocity = (float3)(15, 0, 0); + + /* We have some possible collisions to deal with */ + for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision ) + { + CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision]; + float colliderFriction = shapeDescription.friction; + + if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE ) + { + /* Colliding with a capsule */ + + float capsuleHalfHeight = shapeDescription.halfHeight; + float capsuleRadius = shapeDescription.radius; + float capsuleMargin = shapeDescription.margin; + int capsuleupAxis = shapeDescription.upAxis; + + /* Four columns of worldTransform matrix */ + float4 worldTransform[4]; + worldTransform[0] = shapeDescription.shapeTransform[0]; + worldTransform[1] = shapeDescription.shapeTransform[1]; + worldTransform[2] = shapeDescription.shapeTransform[2]; + worldTransform[3] = shapeDescription.shapeTransform[3]; + + // Correctly define capsule centerline vector + float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); + float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f); + c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 ); + c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 ); + c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 ); + c2.x = -c1.x; + c2.y = -c1.y; + c2.z = -c1.z; + + float4 worldC1 = matrixVectorMul(worldTransform, c1); + float4 worldC2 = matrixVectorMul(worldTransform, c2); + float3 segment = (worldC2 - worldC1).xyz; + + /* compute distance of tangent to vertex along line segment in capsule */ + float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) ); + + float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment, 0.f)); + float distanceFromLine = length(position - closestPoint); + float distanceFromC1 = length(worldC1 - position); + float distanceFromC2 = length(worldC2 - position); + + /* Final distance from collision, point to push from, direction to push in + for impulse force */ + float dist; + float3 normalVector; + if( distanceAlongSegment < 0 ) + { + dist = distanceFromC1; + normalVector = normalize(position - worldC1).xyz; + } else if( distanceAlongSegment > 1.f ) { + dist = distanceFromC2; + normalVector = normalize(position - worldC2).xyz; + } else { + dist = distanceFromLine; + normalVector = normalize(position - closestPoint).xyz; + } + + float3 colliderLinearVelocity = shapeDescription.linearVelocity.xyz; + float3 colliderAngularVelocity = shapeDescription.angularVelocity.xyz; + float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - (float3)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w)); + + float minDistance = capsuleRadius + capsuleMargin; + + /* In case of no collision, this is the value of velocity */ + velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt; + + + // Check for a collision + if( dist < minDistance ) + { + /* Project back to surface along normal */ + position = position + (float4)((minDistance - dist)*normalVector*0.9f, 0.f); + velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt; + float3 relativeVelocity = velocity - velocityOfSurfacePoint; + + float3 p1 = normalize(cross(normalVector, segment)); + float3 p2 = normalize(cross(p1, normalVector)); + /* Full friction is sum of velocities in each direction of plane */ + float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2); + + /* Real friction is peak friction corrected by friction coefficients */ + frictionVector = frictionVector * (colliderFriction*clothFriction); + + float approachSpeed = dot(relativeVelocity, normalVector); + + if( approachSpeed <= 0.0f ) + forceOnVertex -= frictionVector; + } + + } + } + } else { + /* Update velocity */ + float3 difference = position.xyz - previousPosition.xyz; + velocity = difference*velocityCoefficient*isolverdt; + } + + g_vertexVelocities[nodeID] = (float4)(velocity, 0.f); + + /* Update external force */ + g_vertexForces[nodeID] = (float4)(forceOnVertex, 0.f); + + g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f); + } +} + +); diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl new file mode 100644 index 000000000..5ab2a6208 --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl @@ -0,0 +1,213 @@ +MSTRINGIFY( + +typedef struct +{ + int firstObject; + int endObject; +} CollisionObjectIndices; + +typedef struct +{ + float4 shapeTransform[4]; /* column major 4x4 matrix */ + float4 linearVelocity; + float4 angularVelocity; + + int softBodyIdentifier; + int collisionShapeType; + + + // Shape information + // Compressed from the union + float radius; + float halfHeight; + int upAxis; + + float margin; + float friction; + + int padding0; + +} CollisionShapeDescription; + +/* From btBroadphaseProxy.h */ +__constant int CAPSULE_SHAPE_PROXYTYPE = 10; + + +/* Multiply column-major matrix against vector */ +float4 matrixVectorMul( float4 matrix[4], float4 vector ) +{ + float4 returnVector; + float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x); + float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y); + float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z); + float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w); + returnVector.x = dot(row0, vector); + returnVector.y = dot(row1, vector); + returnVector.z = dot(row2, vector); + returnVector.w = dot(row3, vector); + return returnVector; +} + +__kernel void +SolveCollisionsAndUpdateVelocitiesKernel( + const int numNodes, + const float isolverdt, + __global int *g_vertexClothIdentifier, + __global float4 *g_vertexPreviousPositions, + __global float * g_perClothFriction, + __global float * g_clothDampingFactor, + __global CollisionObjectIndices * g_perClothCollisionObjectIndices, + __global CollisionShapeDescription * g_collisionObjectDetails, + __global float4 * g_vertexForces, + __global float4 *g_vertexVelocities, + __global float4 *g_vertexPositions, + __local CollisionShapeDescription *localCollisionShapes) +{ + int nodeID = get_global_id(0); + float3 forceOnVertex = (float3)(0.f, 0.f, 0.f); + + int clothIdentifier = g_vertexClothIdentifier[nodeID]; + + // Abort if this is not a valid cloth + if( clothIdentifier < 0 ) + return; + + float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f); + float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f); + float3 velocity; + float clothFriction = g_perClothFriction[clothIdentifier]; + float dampingFactor = g_clothDampingFactor[clothIdentifier]; + float velocityCoefficient = (1.f - dampingFactor); + CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier]; + + int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject; + if( numObjects > 0 ) + { + /* We have some possible collisions to deal with */ + + /* First load all of the collision objects into LDS */ + int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject; + if( get_local_id(0) < numObjects ) + { + localCollisionShapes[get_local_id(0)] = g_collisionObjectDetails[ collisionObjectIndices.firstObject + get_local_id(0) ]; + } + } + + /* Safe as the vertices are padded so that not more than one soft body is in a group */ + barrier(CLK_LOCAL_MEM_FENCE); + + /* Annoyingly, even though I know the flow control is not varying, the compiler will not let me skip this */ + if( numObjects > 0 ) + { + velocity = (float3)(0, 0, 0); + + + // We have some possible collisions to deal with + for( int collision = 0; collision < numObjects; ++collision ) + { + CollisionShapeDescription shapeDescription = localCollisionShapes[collision]; + float colliderFriction = shapeDescription.friction; + + if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE ) + { + /* Colliding with a capsule */ + + float capsuleHalfHeight = localCollisionShapes[collision].halfHeight; + float capsuleRadius = localCollisionShapes[collision].radius; + float capsuleMargin = localCollisionShapes[collision].margin; + int capsuleupAxis = localCollisionShapes[collision].upAxis; + + float4 worldTransform[4]; + worldTransform[0] = localCollisionShapes[collision].shapeTransform[0]; + worldTransform[1] = localCollisionShapes[collision].shapeTransform[1]; + worldTransform[2] = localCollisionShapes[collision].shapeTransform[2]; + worldTransform[3] = localCollisionShapes[collision].shapeTransform[3]; + + // Correctly define capsule centerline vector + float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); + float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f); + c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 ); + c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 ); + c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 ); + c2.x = -c1.x; + c2.y = -c1.y; + c2.z = -c1.z; + + float4 worldC1 = matrixVectorMul(worldTransform, c1); + float4 worldC2 = matrixVectorMul(worldTransform, c2); + float3 segment = (worldC2 - worldC1).xyz; + + + /* compute distance of tangent to vertex along line segment in capsule */ + float distanceAlongSegment = -( dot( (worldC1 - position).xyz, segment ) / dot(segment, segment) ); + + float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment, 0.f)); + float distanceFromLine = length(position - closestPoint); + float distanceFromC1 = length(worldC1 - position); + float distanceFromC2 = length(worldC2 - position); + + /* Final distance from collision, point to push from, direction to push in + for impulse force */ + float dist; + float3 normalVector; + if( distanceAlongSegment < 0 ) + { + dist = distanceFromC1; + normalVector = normalize(position - worldC1).xyz; + } else if( distanceAlongSegment > 1.f ) { + dist = distanceFromC2; + normalVector = normalize(position - worldC2).xyz; + } else { + dist = distanceFromLine; + normalVector = normalize(position - closestPoint).xyz; + } + + float3 colliderLinearVelocity = localCollisionShapes[collision].linearVelocity.xyz; + float3 colliderAngularVelocity = localCollisionShapes[collision].angularVelocity.xyz; + float3 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position.xyz - (float3)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w)); + + float minDistance = capsuleRadius + capsuleMargin; + + /* In case of no collision, this is the value of velocity */ + velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt; + + + /* Check for a collision */ + if( dist < minDistance ) + { + /* Project back to surface along normal */ + position = position + (float4)((minDistance - dist)*normalVector*0.9f, 0.f); + velocity = (position - previousPosition).xyz * velocityCoefficient * isolverdt; + float3 relativeVelocity = velocity - velocityOfSurfacePoint; + + float3 p1 = normalize(cross(normalVector, segment)); + float3 p2 = normalize(cross(p1, normalVector)); + /* Full friction is sum of velocities in each direction of plane */ + float3 frictionVector = p1*dot(relativeVelocity, p1) + p2*dot(relativeVelocity, p2); + + /* Real friction is peak friction corrected by friction coefficients */ + frictionVector = frictionVector * (colliderFriction*clothFriction); + + float approachSpeed = dot(relativeVelocity, normalVector); + + if( approachSpeed <= 0.0f ) + forceOnVertex -= frictionVector; + } + + } + } + } else { + /* Update velocity */ + float3 difference = position.xyz - previousPosition.xyz; + velocity = difference*velocityCoefficient*isolverdt; + } + + g_vertexVelocities[nodeID] = (float4)(velocity, 0.f); + + /* Update external force */ + g_vertexForces[nodeID] = (float4)(forceOnVertex, 0.f); + + g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f); +} + +); diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ComputeBounds.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ComputeBounds.cl new file mode 100644 index 000000000..f18eada1b --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/ComputeBounds.cl @@ -0,0 +1,82 @@ +MSTRINGIFY( +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n + +__kernel void +ComputeBoundsKernel( + const int numNodes, + const int numSoftBodies, + __global int * g_vertexClothIdentifier, + __global float4 * g_vertexPositions, + /* Unfortunately, to get the atomics below to work these arrays cannot be */ + /* uint4, though that is the layout of the data */ + /* Therefore this is little-endian-only code */ + volatile __global uint * g_clothMinBounds, + volatile __global uint * g_clothMaxBounds, + volatile __local uint * clothMinBounds, + volatile __local uint * clothMaxBounds) +{ + // Init min and max bounds arrays + if( get_local_id(0) < numSoftBodies ) + { + + clothMinBounds[get_local_id(0)*4] = UINT_MAX; + clothMinBounds[get_local_id(0)*4+1] = UINT_MAX; + clothMinBounds[get_local_id(0)*4+2] = UINT_MAX; + clothMinBounds[get_local_id(0)*4+3] = UINT_MAX; + clothMaxBounds[get_local_id(0)*4] = 0; + clothMaxBounds[get_local_id(0)*4+1] = 0; + clothMaxBounds[get_local_id(0)*4+2] = 0; + clothMaxBounds[get_local_id(0)*4+3] = 0; + + } + + barrier(CLK_LOCAL_MEM_FENCE); + + int nodeID = get_global_id(0); + if( nodeID < numNodes ) + { + int clothIdentifier = g_vertexClothIdentifier[nodeID]; + if( clothIdentifier >= 0 ) + { + + float4 position = (float4)(g_vertexPositions[nodeID].xyz, 0.f); + + /* Reinterpret position as uint */ + uint4 positionUInt = (uint4)(as_uint(position.x), as_uint(position.y), as_uint(position.z), 0); + + /* Invert sign bit of positives and whole of negatives to allow comparison as unsigned ints */ + positionUInt.x ^= (1+~(positionUInt.x >> 31) | 0x80000000); + positionUInt.y ^= (1+~(positionUInt.y >> 31) | 0x80000000); + positionUInt.z ^= (1+~(positionUInt.z >> 31) | 0x80000000); + + // Min/max with the LDS values + atom_min(&(clothMinBounds[clothIdentifier*4]), positionUInt.x); + atom_min(&(clothMinBounds[clothIdentifier*4+1]), positionUInt.y); + atom_min(&(clothMinBounds[clothIdentifier*4+2]), positionUInt.z); + + atom_max(&(clothMaxBounds[clothIdentifier*4]), positionUInt.x); + atom_max(&(clothMaxBounds[clothIdentifier*4+1]), positionUInt.y); + atom_max(&(clothMaxBounds[clothIdentifier*4+2]), positionUInt.z); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + + /* Use global atomics to update the global versions of the data */ + if( get_local_id(0) < numSoftBodies ) + { + /*atom_min(&(g_clothMinBounds[get_local_id(0)].x), clothMinBounds[get_local_id(0)].x);*/ + atom_min(&(g_clothMinBounds[get_local_id(0)*4]), clothMinBounds[get_local_id(0)*4]); + atom_min(&(g_clothMinBounds[get_local_id(0)*4+1]), clothMinBounds[get_local_id(0)*4+1]); + atom_min(&(g_clothMinBounds[get_local_id(0)*4+2]), clothMinBounds[get_local_id(0)*4+2]); + + atom_max(&(g_clothMaxBounds[get_local_id(0)*4]), clothMaxBounds[get_local_id(0)*4]); + atom_max(&(g_clothMaxBounds[get_local_id(0)*4+1]), clothMaxBounds[get_local_id(0)*4+1]); + atom_max(&(g_clothMaxBounds[get_local_id(0)*4+2]), clothMaxBounds[get_local_id(0)*4+2]); + } +} + + +); diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/OutputToVertexArray.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/OutputToVertexArray.cl new file mode 100644 index 000000000..f04e0926e --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/OutputToVertexArray.cl @@ -0,0 +1,46 @@ +MSTRINGIFY( + +__kernel void +OutputToVertexArrayWithNormalsKernel( + const int startNode, const int numNodes, __global float *g_vertexBuffer, + const int positionOffset, const int positionStride, const __global float4* g_vertexPositions, + const int normalOffset, const int normalStride, const __global float4* g_vertexNormals ) +{ + int nodeID = get_global_id(0); + if( nodeID < numNodes ) + { + float4 position = g_vertexPositions[nodeID + startNode]; + float4 normal = g_vertexNormals[nodeID + startNode]; + + // Stride should account for the float->float4 conversion + int positionDestination = nodeID * positionStride + positionOffset; + g_vertexBuffer[positionDestination] = position.x; + g_vertexBuffer[positionDestination+1] = position.y; + g_vertexBuffer[positionDestination+2] = position.z; + + int normalDestination = nodeID * normalStride + normalOffset; + g_vertexBuffer[normalDestination] = normal.x; + g_vertexBuffer[normalDestination+1] = normal.y; + g_vertexBuffer[normalDestination+2] = normal.z; + } +} + +__kernel void +OutputToVertexArrayWithoutNormalsKernel( + const int startNode, const int numNodes, __global float *g_vertexBuffer, + const int positionOffset, const int positionStride, const __global float4* g_vertexPositions ) +{ + int nodeID = get_global_id(0); + if( nodeID < numNodes ) + { + float4 position = g_vertexPositions[nodeID + startNode]; + + // Stride should account for the float->float4 conversion + int positionDestination = nodeID * positionStride + positionOffset; + g_vertexBuffer[positionDestination] = position.x; + g_vertexBuffer[positionDestination+1] = position.y; + g_vertexBuffer[positionDestination+2] = position.z; + } +} + +); \ No newline at end of file diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositionsSIMDBatched.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositionsSIMDBatched.cl new file mode 100644 index 000000000..aaed72988 --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/SolvePositionsSIMDBatched.cl @@ -0,0 +1,140 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +MSTRINGIFY( + +float mydot3(float4 a, float4 b) +{ + return a.x*b.x + a.y*b.y + a.z*b.z; +} + +__kernel __attribute__((reqd_work_group_size(WAVEFRONT_BLOCK_MULTIPLIER*WAVEFRONT_SIZE, 1, 1))) +void +SolvePositionsFromLinksKernel( + const int startWaveInBatch, + const int numWaves, + const float kst, + const float ti, + __global int2 *g_wavefrontBatchCountsVertexCounts, + __global int *g_vertexAddressesPerWavefront, + __global int2 * g_linksVertexIndices, + __global float * g_linksMassLSC, + __global float * g_linksRestLengthSquared, + __global float * g_verticesInverseMass, + __global float4 * g_vertexPositions, + __local int2 *wavefrontBatchCountsVertexCounts, + __local float4 *vertexPositionSharedData, + __local float *vertexInverseMassSharedData) +{ + const int laneInWavefront = (get_global_id(0) & (WAVEFRONT_SIZE-1)); + const int wavefront = startWaveInBatch + (get_global_id(0) / WAVEFRONT_SIZE); + const int firstWavefrontInBlock = startWaveInBatch + get_group_id(0) * WAVEFRONT_BLOCK_MULTIPLIER; + const int localWavefront = wavefront - firstWavefrontInBlock; + + // Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier + if( wavefront < (startWaveInBatch + numWaves) ) + { + // Load the batch counts for the wavefronts + // Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier + if( laneInWavefront == 0 ) + { + int2 batchesAndVertexCountsWithinWavefront = g_wavefrontBatchCountsVertexCounts[wavefront]; + wavefrontBatchCountsVertexCounts[localWavefront] = batchesAndVertexCountsWithinWavefront; + } + + + barrier(CLK_LOCAL_MEM_FENCE); + + + int2 batchesAndVerticesWithinWavefront = wavefrontBatchCountsVertexCounts[localWavefront]; + int batchesWithinWavefront = batchesAndVerticesWithinWavefront.x; + int verticesUsedByWave = batchesAndVerticesWithinWavefront.y; + + // Load the vertices for the wavefronts + for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE ) + { + int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex]; + + vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_vertexPositions[vertexAddress]; + vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Loop through the batches performing the solve on each in LDS + int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE; + + //for( int batch = 0; batch < batchesWithinWavefront; ++batch ) + + int batch = 0; + do + { + int baseDataLocation = baseDataLocationForWave + WAVEFRONT_SIZE * batch; + int locationOfValue = baseDataLocation + laneInWavefront; + + + // These loads should all be perfectly linear across the WF + int2 localVertexIndices = g_linksVertexIndices[locationOfValue]; + float massLSC = g_linksMassLSC[locationOfValue]; + float restLengthSquared = g_linksRestLengthSquared[locationOfValue]; + + // LDS vertex addresses based on logical wavefront number in block and loaded index + int vertexAddress0 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.x; + int vertexAddress1 = MAX_NUM_VERTICES_PER_WAVE * localWavefront + localVertexIndices.y; + + float4 position0 = vertexPositionSharedData[vertexAddress0]; + float4 position1 = vertexPositionSharedData[vertexAddress1]; + + float inverseMass0 = vertexInverseMassSharedData[vertexAddress0]; + float inverseMass1 = vertexInverseMassSharedData[vertexAddress1]; + + float4 del = position1 - position0; + float len = mydot3(del, del); + + float k = 0; + if( massLSC > 0.0f ) + { + k = ((restLengthSquared - len)/(massLSC*(restLengthSquared+len)))*kst; + } + + position0 = position0 - del*(k*inverseMass0); + position1 = position1 + del*(k*inverseMass1); + + // Ensure compiler does not re-order memory operations + barrier(CLK_LOCAL_MEM_FENCE); + + vertexPositionSharedData[vertexAddress0] = position0; + vertexPositionSharedData[vertexAddress1] = position1; + + // Ensure compiler does not re-order memory operations + barrier(CLK_LOCAL_MEM_FENCE); + + + ++batch; + } while( batch < batchesWithinWavefront ); + + // Update the global memory vertices for the wavefronts + for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE ) + { + int vertexAddress = g_vertexAddressesPerWavefront[wavefront*MAX_NUM_VERTICES_PER_WAVE + vertex]; + + g_vertexPositions[vertexAddress] = (float4)(vertexPositionSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex].xyz, 0.f); + } + + } + +} + +); diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocities.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocities.cl new file mode 100644 index 000000000..29b04024f --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocities.cl @@ -0,0 +1,204 @@ +MSTRINGIFY( + +#pragma OPENCL EXTENSION cl_amd_printf : enable\n + +float mydot3(float4 a, float4 b) +{ + return a.x*b.x + a.y*b.y + a.z*b.z; +} + + +typedef struct +{ + int firstObject; + int endObject; +} CollisionObjectIndices; + +typedef struct +{ + float4 shapeTransform[4]; // column major 4x4 matrix + float4 linearVelocity; + float4 angularVelocity; + + int softBodyIdentifier; + int collisionShapeType; + + + // Shape information + // Compressed from the union + float radius; + float halfHeight; + int upAxis; + + float margin; + float friction; + + int padding0; + +} CollisionShapeDescription; + +// From btBroadphaseProxy.h +__constant int CAPSULE_SHAPE_PROXYTYPE = 10; + +// Multiply column-major matrix against vector +float4 matrixVectorMul( float4 matrix[4], float4 vector ) +{ + float4 returnVector; + float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x); + float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y); + float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z); + float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w); + returnVector.x = dot(row0, vector); + returnVector.y = dot(row1, vector); + returnVector.z = dot(row2, vector); + returnVector.w = dot(row3, vector); + return returnVector; +} + +__kernel void +SolveCollisionsAndUpdateVelocitiesKernel( + const int numNodes, + const float isolverdt, + __global int *g_vertexClothIdentifier, + __global float4 *g_vertexPreviousPositions, + __global float * g_perClothFriction, + __global float * g_clothDampingFactor, + __global CollisionObjectIndices * g_perClothCollisionObjectIndices, + __global CollisionShapeDescription * g_collisionObjectDetails, + __global float4 * g_vertexForces, + __global float4 *g_vertexVelocities, + __global float4 *g_vertexPositions) +{ + int nodeID = get_global_id(0); + float4 forceOnVertex = (float4)(0.f, 0.f, 0.f, 0.f); + + if( get_global_id(0) < numNodes ) + { + int clothIdentifier = g_vertexClothIdentifier[nodeID]; + + // Abort if this is not a valid cloth + if( clothIdentifier < 0 ) + return; + + + float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f); + float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f); + + float clothFriction = g_perClothFriction[clothIdentifier]; + float dampingFactor = g_clothDampingFactor[clothIdentifier]; + float velocityCoefficient = (1.f - dampingFactor); + float4 difference = position - previousPosition; + float4 velocity = difference*velocityCoefficient*isolverdt; + + CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier]; + + int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject; + + if( numObjects > 0 ) + { + // We have some possible collisions to deal with + for( int collision = collisionObjectIndices.firstObject; collision < collisionObjectIndices.endObject; ++collision ) + { + CollisionShapeDescription shapeDescription = g_collisionObjectDetails[collision]; + float colliderFriction = shapeDescription.friction; + + if( shapeDescription.collisionShapeType == CAPSULE_SHAPE_PROXYTYPE ) + { + // Colliding with a capsule + + float capsuleHalfHeight = shapeDescription.halfHeight; + float capsuleRadius = shapeDescription.radius; + float capsuleMargin = shapeDescription.margin; + int capsuleupAxis = shapeDescription.upAxis; + + // Four columns of worldTransform matrix + float4 worldTransform[4]; + worldTransform[0] = shapeDescription.shapeTransform[0]; + worldTransform[1] = shapeDescription.shapeTransform[1]; + worldTransform[2] = shapeDescription.shapeTransform[2]; + worldTransform[3] = shapeDescription.shapeTransform[3]; + + // Correctly define capsule centerline vector + float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); + float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f); + c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 ); + c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 ); + c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 ); + c2.x = -c1.x; + c2.y = -c1.y; + c2.z = -c1.z; + + + float4 worldC1 = matrixVectorMul(worldTransform, c1); + float4 worldC2 = matrixVectorMul(worldTransform, c2); + float4 segment = (worldC2 - worldC1); + + // compute distance of tangent to vertex along line segment in capsule + float distanceAlongSegment = -( mydot3( (worldC1 - position), segment ) / mydot3(segment, segment) ); + + float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment)); + float distanceFromLine = length(position - closestPoint); + float distanceFromC1 = length(worldC1 - position); + float distanceFromC2 = length(worldC2 - position); + + // Final distance from collision, point to push from, direction to push in + // for impulse force + float dist; + float4 normalVector; + if( distanceAlongSegment < 0 ) + { + dist = distanceFromC1; + normalVector = (float4)(normalize(position - worldC1).xyz, 0.f); + } else if( distanceAlongSegment > 1.f ) { + dist = distanceFromC2; + normalVector = (float4)(normalize(position - worldC2).xyz, 0.f); + } else { + dist = distanceFromLine; + normalVector = (float4)(normalize(position - closestPoint).xyz, 0.f); + } + + float4 colliderLinearVelocity = shapeDescription.linearVelocity; + float4 colliderAngularVelocity = shapeDescription.angularVelocity; + float4 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position - (float4)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w, 0.f)); + + float minDistance = capsuleRadius + capsuleMargin; + + // In case of no collision, this is the value of velocity + velocity = (position - previousPosition) * velocityCoefficient * isolverdt; + + + // Check for a collision + if( dist < minDistance ) + { + // Project back to surface along normal + position = position + (float4)((minDistance - dist)*normalVector*0.9f); + velocity = (position - previousPosition) * velocityCoefficient * isolverdt; + float4 relativeVelocity = velocity - velocityOfSurfacePoint; + + float4 p1 = normalize(cross(normalVector, segment)); + float4 p2 = normalize(cross(p1, normalVector)); + // Full friction is sum of velocities in each direction of plane + float4 frictionVector = p1*mydot3(relativeVelocity, p1) + p2*mydot3(relativeVelocity, p2); + + // Real friction is peak friction corrected by friction coefficients + frictionVector = frictionVector * (colliderFriction*clothFriction); + + float approachSpeed = dot(relativeVelocity, normalVector); + + if( approachSpeed <= 0.0f ) + forceOnVertex -= frictionVector; + } + } + } + } + + g_vertexVelocities[nodeID] = (float4)(velocity.xyz, 0.f); + + // Update external force + g_vertexForces[nodeID] = (float4)(forceOnVertex.xyz, 0.f); + + g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f); + } +} + +); diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl new file mode 100644 index 000000000..ba57c8869 --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/OpenCLC10/solveCollisionsAndUpdateVelocitiesSIMDBatched.cl @@ -0,0 +1,219 @@ +MSTRINGIFY( +float mydot3(float4 a, float4 b) +{ + return a.x*b.x + a.y*b.y + a.z*b.z; +} + + +typedef struct +{ + int firstObject; + int endObject; +} CollisionObjectIndices; + +typedef struct +{ + float4 shapeTransform[4]; // column major 4x4 matrix + float4 linearVelocity; + float4 angularVelocity; + + int softBodyIdentifier; + int collisionShapeType; + + + // Shape information + // Compressed from the union + float radius; + float halfHeight; + int upAxis; + + float margin; + float friction; + + int padding0; + +} CollisionShapeDescription; + +// From btBroadphaseProxy.h +__constant int CAPSULE_SHAPE_PROXYTYPE = 10; + + +/* Multiply column-major matrix against vector */ +float4 matrixVectorMul( float4 matrix[4], float4 vector ) +{ + float4 returnVector; + float4 row0 = (float4)(matrix[0].x, matrix[1].x, matrix[2].x, matrix[3].x); + float4 row1 = (float4)(matrix[0].y, matrix[1].y, matrix[2].y, matrix[3].y); + float4 row2 = (float4)(matrix[0].z, matrix[1].z, matrix[2].z, matrix[3].z); + float4 row3 = (float4)(matrix[0].w, matrix[1].w, matrix[2].w, matrix[3].w); + returnVector.x = dot(row0, vector); + returnVector.y = dot(row1, vector); + returnVector.z = dot(row2, vector); + returnVector.w = dot(row3, vector); + return returnVector; +} + +__kernel void +SolveCollisionsAndUpdateVelocitiesKernel( + const int numNodes, + const float isolverdt, + __global int *g_vertexClothIdentifier, + __global float4 *g_vertexPreviousPositions, + __global float * g_perClothFriction, + __global float * g_clothDampingFactor, + __global CollisionObjectIndices * g_perClothCollisionObjectIndices, + __global CollisionShapeDescription * g_collisionObjectDetails, + __global float4 * g_vertexForces, + __global float4 *g_vertexVelocities, + __global float4 *g_vertexPositions, + __local CollisionShapeDescription *localCollisionShapes) +{ + int nodeID = get_global_id(0); + float4 forceOnVertex = (float4)(0.f, 0.f, 0.f, 0.f); + + int clothIdentifier = g_vertexClothIdentifier[nodeID]; + + // Abort if this is not a valid cloth + if( clothIdentifier < 0 ) + return; + + + float4 position = (float4)(g_vertexPositions[nodeID].xyz, 1.f); + float4 previousPosition = (float4)(g_vertexPreviousPositions[nodeID].xyz, 1.f); + float clothFriction = g_perClothFriction[clothIdentifier]; + float dampingFactor = g_clothDampingFactor[clothIdentifier]; + float velocityCoefficient = (1.f - dampingFactor); + + // Update velocity + float4 difference = position - previousPosition; + float4 velocity = difference*velocityCoefficient*isolverdt; + CollisionObjectIndices collisionObjectIndices = g_perClothCollisionObjectIndices[clothIdentifier]; + + int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject; + if( numObjects > 0 ) + { + // We have some possible collisions to deal with + + // First load all of the collision objects into LDS + int numObjects = collisionObjectIndices.endObject - collisionObjectIndices.firstObject; + if( get_local_id(0) < numObjects ) + { + localCollisionShapes[get_local_id(0)] = g_collisionObjectDetails[ collisionObjectIndices.firstObject + get_local_id(0) ]; + } + } + + // Safe as the vertices are padded so that not more than one soft body is in a group + barrier(CLK_LOCAL_MEM_FENCE); + + // Annoyingly, even though I know the flow control is not varying, the compiler will not let me skip this + if( numObjects > 0 ) + { + + + // We have some possible collisions to deal with + for( int collision = 0; collision < numObjects; ++collision ) + { + //CollisionShapeDescription shapeDescription = localCollisionShapes[collision]; + float colliderFriction = localCollisionShapes[collision].friction; + + if( localCollisionShapes[collision].collisionShapeType == CAPSULE_SHAPE_PROXYTYPE ) + { + // Colliding with a capsule + + float capsuleHalfHeight = localCollisionShapes[collision].halfHeight; + float capsuleRadius = localCollisionShapes[collision].radius; + float capsuleMargin = localCollisionShapes[collision].margin; + int capsuleupAxis = localCollisionShapes[collision].upAxis; + + float4 worldTransform[4]; + worldTransform[0] = localCollisionShapes[collision].shapeTransform[0]; + worldTransform[1] = localCollisionShapes[collision].shapeTransform[1]; + worldTransform[2] = localCollisionShapes[collision].shapeTransform[2]; + worldTransform[3] = localCollisionShapes[collision].shapeTransform[3]; + + //float4 c1 = (float4)(0.f, -capsuleHalfHeight, 0.f, 1.f); + //float4 c2 = (float4)(0.f, +capsuleHalfHeight, 0.f, 1.f); + // Correctly define capsule centerline vector + float4 c1 = (float4)(0.f, 0.f, 0.f, 1.f); + float4 c2 = (float4)(0.f, 0.f, 0.f, 1.f); + c1.x = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 0 ); + c1.y = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 1 ); + c1.z = select( 0.f, -capsuleHalfHeight, capsuleupAxis == 2 ); + c2.x = -c1.x; + c2.y = -c1.y; + c2.z = -c1.z; + + float4 worldC1 = matrixVectorMul(worldTransform, c1); + float4 worldC2 = matrixVectorMul(worldTransform, c2); + float4 segment = (worldC2 - worldC1); + + + // compute distance of tangent to vertex along line segment in capsule + float distanceAlongSegment = -( mydot3( (worldC1 - position), segment ) / mydot3(segment, segment) ); + + float4 closestPoint = (worldC1 + (float4)(segment * distanceAlongSegment)); + float distanceFromLine = length(position - closestPoint); + float distanceFromC1 = length(worldC1 - position); + float distanceFromC2 = length(worldC2 - position); + + // Final distance from collision, point to push from, direction to push in + // for impulse force + float dist; + float4 normalVector; + if( distanceAlongSegment < 0 ) + { + dist = distanceFromC1; + normalVector = normalize(position - worldC1); + } else if( distanceAlongSegment > 1.f ) { + dist = distanceFromC2; + normalVector = normalize(position - worldC2); + } else { + dist = distanceFromLine; + normalVector = normalize(position - closestPoint); + } + + float4 colliderLinearVelocity = localCollisionShapes[collision].linearVelocity; + float4 colliderAngularVelocity = localCollisionShapes[collision].angularVelocity; + float4 velocityOfSurfacePoint = colliderLinearVelocity + cross(colliderAngularVelocity, position - (float4)(worldTransform[0].w, worldTransform[1].w, worldTransform[2].w, 0.f)); + + float minDistance = capsuleRadius + capsuleMargin; + + // In case of no collision, this is the value of velocity + velocity = (position - previousPosition) * velocityCoefficient * isolverdt; + + + // Check for a collision + if( dist < minDistance ) + { + // Project back to surface along normal + position = position + (float4)((minDistance - dist)*normalVector*0.9f); + velocity = (position - previousPosition) * velocityCoefficient * isolverdt; + float4 relativeVelocity = velocity - velocityOfSurfacePoint; + + float4 p1 = (float4)(normalize(cross(normalVector, segment)).xyz, 0.f); + float4 p2 = (float4)(normalize(cross(p1, normalVector)).xyz, 0.f); + // Full friction is sum of velocities in each direction of plane + float4 frictionVector = p1*mydot3(relativeVelocity, p1) + p2*mydot3(relativeVelocity, p2); + + // Real friction is peak friction corrected by friction coefficients + frictionVector = frictionVector * (colliderFriction*clothFriction); + + float approachSpeed = dot(relativeVelocity, normalVector); + + if( approachSpeed <= 0.0f ) + forceOnVertex -= frictionVector; + } + + } + } + } + + g_vertexVelocities[nodeID] = (float4)(velocity.xyz, 0.f); + + // Update external force + g_vertexForces[nodeID] = (float4)(forceOnVertex.xyz, 0.f); + + g_vertexPositions[nodeID] = (float4)(position.xyz, 0.f); +} + +); diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCLSIMDAware.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCLSIMDAware.h new file mode 100644 index 000000000..5e3511040 --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverLinkData_OpenCLSIMDAware.h @@ -0,0 +1,169 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#include "BulletMultiThreaded/GpuSoftBodySolvers/CPU/btSoftBodySolverData.h" +#include "btSoftBodySolverBuffer_OpenCL.h" + + +#ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_SIMDAWARE_H +#define BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_SIMDAWARE_H + + +class btSoftBodyLinkDataOpenCLSIMDAware : public btSoftBodyLinkData +{ +public: + bool m_onGPU; + + cl_command_queue m_cqCommandQue; + + const int m_wavefrontSize; + const int m_linksPerWorkItem; + const int m_maxLinksPerWavefront; + int m_maxBatchesWithinWave; + int m_maxVerticesWithinWave; + int m_numWavefronts; + + int m_maxVertex; + + struct NumBatchesVerticesPair + { + int numBatches; + int numVertices; + }; + + btAlignedObjectArray m_linksPerWavefront; + btAlignedObjectArray m_numBatchesAndVerticesWithinWaves; + btOpenCLBuffer< NumBatchesVerticesPair > m_clNumBatchesAndVerticesWithinWaves; + + // All arrays here will contain batches of m_maxLinksPerWavefront links + // ordered by wavefront. + // with either global vertex pairs or local vertex pairs + btAlignedObjectArray< int > m_wavefrontVerticesGlobalAddresses; // List of global vertices per wavefront + btOpenCLBuffer m_clWavefrontVerticesGlobalAddresses; + btAlignedObjectArray< LinkNodePair > m_linkVerticesLocalAddresses; // Vertex pair for the link + btOpenCLBuffer m_clLinkVerticesLocalAddresses; + btOpenCLBuffer m_clLinkStrength; + btOpenCLBuffer m_clLinksMassLSC; + btOpenCLBuffer m_clLinksRestLengthSquared; + btOpenCLBuffer m_clLinksRestLength; + btOpenCLBuffer m_clLinksMaterialLinearStiffnessCoefficient; + + struct BatchPair + { + int start; + int length; + + BatchPair() : + start(0), + length(0) + { + } + + BatchPair( int s, int l ) : + start( s ), + length( l ) + { + } + }; + + /** + * Link addressing information for each cloth. + * Allows link locations to be computed independently of data batching. + */ + btAlignedObjectArray< int > m_linkAddresses; + + /** + * Start and length values for computation batches over link data. + */ + btAlignedObjectArray< BatchPair > m_wavefrontBatchStartLengths; + + btSoftBodyLinkDataOpenCLSIMDAware(cl_command_queue queue, cl_context ctx); + + virtual ~btSoftBodyLinkDataOpenCLSIMDAware(); + + /** Allocate enough space in all link-related arrays to fit numLinks links */ + virtual void createLinks( int numLinks ); + + /** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */ + virtual void setLinkAt( + const LinkDescription &link, + int linkIndex ); + + virtual bool onAccelerator(); + + virtual bool moveToAccelerator(); + + virtual bool moveFromAccelerator(); + + /** + * Generate (and later update) the batching for the entire link set. + * This redoes a lot of work because it batches the entire set when each cloth is inserted. + * In theory we could delay it until just before we need the cloth. + * It's a one-off overhead, though, so that is a later optimisation. + */ + void generateBatches(); + + int getMaxVerticesPerWavefront() + { + return m_maxVerticesWithinWave; + } + + int getWavefrontSize() + { + return m_wavefrontSize; + } + + int getLinksPerWorkItem() + { + return m_linksPerWorkItem; + } + + int getMaxLinksPerWavefront() + { + return m_maxLinksPerWavefront; + } + + int getMaxBatchesPerWavefront() + { + return m_maxBatchesWithinWave; + } + + int getNumWavefronts() + { + return m_numWavefronts; + } + + NumBatchesVerticesPair getNumBatchesAndVerticesWithinWavefront( int wavefront ) + { + return m_numBatchesAndVerticesWithinWaves[wavefront]; + } + + int getVertexGlobalAddresses( int vertexIndex ) + { + return m_wavefrontVerticesGlobalAddresses[vertexIndex]; + } + + /** + * Get post-batching local addresses of the vertex pair for a link assuming all vertices used by a wavefront are loaded locally. + */ + LinkNodePair getVertexPairLocalAddresses( int linkIndex ) + { + return m_linkVerticesLocalAddresses[linkIndex]; + } +}; + + + +#endif // #ifndef BT_SOFT_BODY_SOLVER_LINK_DATA_OPENCL_SIMDAWARE_H diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp new file mode 100644 index 000000000..b00795a9a --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.cpp @@ -0,0 +1,133 @@ +#include "btSoftBodySolverOutputCLtoGL.h" +#include //@todo: remove the debugging printf at some stage +#include "btSoftBodySolver_OpenCL.h" +#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h" +#include "btSoftBodySolverVertexBuffer_OpenGL.h" +#include "BulletSoftBody/btSoftBody.h" + +#if (0)//CL_VERSION_1_1 == 1) + //OpenCL 1.1 kernels use float3 +#define MSTRINGIFY(A) #A +static char* OutputToVertexArrayCLString = +#include "OpenCLC/OutputToVertexArray.cl" +#else +////OpenCL 1.0 kernels don't use float3 +#define MSTRINGIFY(A) #A +static char* OutputToVertexArrayCLString = +#include "OpenCLC10/OutputToVertexArray.cl" +#endif //CL_VERSION_1_1 + + +#define RELEASE_CL_KERNEL(kernelName) {if( kernelName ){ clReleaseKernel( kernelName ); kernelName = 0; }} + +static const size_t workGroupSize = 128; + +void btSoftBodySolverOutputCLtoGL::copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer ) +{ + + btSoftBodySolver *solver = softBody->getSoftBodySolver(); + btAssert( solver->getSolverType() == btSoftBodySolver::CL_SOLVER || solver->getSolverType() == btSoftBodySolver::CL_SIMD_SOLVER ); + btOpenCLSoftBodySolver *dxSolver = static_cast< btOpenCLSoftBodySolver * >( solver ); + checkInitialized(); + btOpenCLAcceleratedSoftBodyInterface* currentCloth = dxSolver->findSoftBodyInterface( softBody ); + btSoftBodyVertexDataOpenCL &vertexData( dxSolver->m_vertexData ); + + const int firstVertex = currentCloth->getFirstVertex(); + const int lastVertex = firstVertex + currentCloth->getNumVertices(); + + if( vertexBuffer->getBufferType() == btVertexBufferDescriptor::OPENGL_BUFFER ) { + + const btOpenGLInteropVertexBufferDescriptor *openGLVertexBuffer = static_cast< btOpenGLInteropVertexBufferDescriptor* >(vertexBuffer); + cl_int ciErrNum = CL_SUCCESS; + + cl_mem clBuffer = openGLVertexBuffer->getBuffer(); + cl_kernel outputKernel = outputToVertexArrayWithNormalsKernel; + if( !vertexBuffer->hasNormals() ) + outputKernel = outputToVertexArrayWithoutNormalsKernel; + + ciErrNum = clEnqueueAcquireGLObjects(m_cqCommandQue, 1, &clBuffer, 0, 0, NULL); + if( ciErrNum != CL_SUCCESS ) + { + btAssert( 0 && "clEnqueueAcquireGLObjects(copySoftBodyToVertexBuffer)"); + } + + int numVertices = currentCloth->getNumVertices(); + + ciErrNum = clSetKernelArg(outputKernel, 0, sizeof(int), &firstVertex ); + ciErrNum = clSetKernelArg(outputKernel, 1, sizeof(int), &numVertices ); + ciErrNum = clSetKernelArg(outputKernel, 2, sizeof(cl_mem), (void*)&clBuffer ); + if( vertexBuffer->hasVertexPositions() ) + { + int vertexOffset = vertexBuffer->getVertexOffset(); + int vertexStride = vertexBuffer->getVertexStride(); + ciErrNum = clSetKernelArg(outputKernel, 3, sizeof(int), &vertexOffset ); + ciErrNum = clSetKernelArg(outputKernel, 4, sizeof(int), &vertexStride ); + ciErrNum = clSetKernelArg(outputKernel, 5, sizeof(cl_mem), (void*)&vertexData.m_clVertexPosition.m_buffer ); + + } + if( vertexBuffer->hasNormals() ) + { + int normalOffset = vertexBuffer->getNormalOffset(); + int normalStride = vertexBuffer->getNormalStride(); + ciErrNum = clSetKernelArg(outputKernel, 6, sizeof(int), &normalOffset ); + ciErrNum = clSetKernelArg(outputKernel, 7, sizeof(int), &normalStride ); + ciErrNum = clSetKernelArg(outputKernel, 8, sizeof(cl_mem), (void*)&vertexData.m_clVertexNormal.m_buffer ); + + } + size_t numWorkItems = workGroupSize*((vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize); + ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, outputKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0); + if( ciErrNum != CL_SUCCESS ) + { + btAssert( 0 && "enqueueNDRangeKernel(copySoftBodyToVertexBuffer)"); + } + + ciErrNum = clEnqueueReleaseGLObjects(m_cqCommandQue, 1, &clBuffer, 0, 0, 0); + if( ciErrNum != CL_SUCCESS ) + { + btAssert( 0 && "clEnqueueReleaseGLObjects(copySoftBodyToVertexBuffer)"); + } + } else { + btAssert( "Undefined output for this solver output" == false ); + } + + // clFinish in here may not be the best thing. It's possible that we should have a waitForFrameComplete function. + clFinish(m_cqCommandQue); + +} // btSoftBodySolverOutputCLtoGL::outputToVertexBuffers + +bool btSoftBodySolverOutputCLtoGL::buildShaders() +{ + // Ensure current kernels are released first + releaseKernels(); + + bool returnVal = true; + + if( m_shadersInitialized ) + return true; + + outputToVertexArrayWithNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithNormalsKernel" ); + outputToVertexArrayWithoutNormalsKernel = clFunctions.compileCLKernelFromString( OutputToVertexArrayCLString, "OutputToVertexArrayWithoutNormalsKernel" ); + + + if( returnVal ) + m_shadersInitialized = true; + + return returnVal; +} // btSoftBodySolverOutputCLtoGL::buildShaders + +void btSoftBodySolverOutputCLtoGL::releaseKernels() +{ + RELEASE_CL_KERNEL( outputToVertexArrayWithNormalsKernel ); + RELEASE_CL_KERNEL( outputToVertexArrayWithoutNormalsKernel ); + + m_shadersInitialized = false; +} // btSoftBodySolverOutputCLtoGL::releaseKernels + +bool btSoftBodySolverOutputCLtoGL::checkInitialized() +{ + if( !m_shadersInitialized ) + if( buildShaders() ) + m_shadersInitialized = true; + + return m_shadersInitialized; +} \ No newline at end of file diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.h new file mode 100644 index 000000000..45279b0c9 --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverOutputCLtoGL.h @@ -0,0 +1,62 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef BT_SOFT_BODY_SOLVER_OUTPUT_CL_TO_GL_H +#define BT_SOFT_BODY_SOLVER_OUTPUT_CL_TO_GL_H + +#include "btSoftBodySolver_OpenCL.h" + +/** + * Class to manage movement of data from a solver to a given target. + * This version is the CL to GL interop version. + */ +class btSoftBodySolverOutputCLtoGL : public btSoftBodySolverOutput +{ +protected: + cl_command_queue m_cqCommandQue; + cl_context m_cxMainContext; + CLFunctions clFunctions; + + cl_kernel outputToVertexArrayWithNormalsKernel; + cl_kernel outputToVertexArrayWithoutNormalsKernel; + + bool m_shadersInitialized; + + virtual bool checkInitialized(); + virtual bool buildShaders(); + void releaseKernels(); +public: + btSoftBodySolverOutputCLtoGL(cl_command_queue cqCommandQue, cl_context cxMainContext) : + m_cqCommandQue( cqCommandQue ), + m_cxMainContext( cxMainContext ), + clFunctions(cqCommandQue, cxMainContext), + outputToVertexArrayWithNormalsKernel( 0 ), + outputToVertexArrayWithoutNormalsKernel( 0 ), + m_shadersInitialized( false ) + { + } + + virtual ~btSoftBodySolverOutputCLtoGL() + { + releaseKernels(); + } + + /** Output current computed vertex data to the vertex buffers for all cloths in the solver. */ + virtual void copySoftBodyToVertexBuffer( const btSoftBody * const softBody, btVertexBufferDescriptor *vertexBuffer ); +}; + + + +#endif // #ifndef BT_SOFT_BODY_SOLVER_OUTPUT_CL_TO_GL_H \ No newline at end of file diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexBuffer_OpenGL.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexBuffer_OpenGL.h new file mode 100644 index 000000000..4d5ae31a4 --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolverVertexBuffer_OpenGL.h @@ -0,0 +1,168 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_OPENGL_H +#define BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_OPENGL_H + + +#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h" +#ifdef USE_MINICL + #include "MiniCL/cl.h" +#else //USE_MINICL + #ifdef __APPLE__ + #include + #else + #include + #endif //__APPLE__ +#endif//USE_MINICL +#ifndef USE_MINICL +#include +#endif //USE_MINICL + + +#ifdef _WIN32//for glut.h +#include +#endif + +//think different +#if defined(__APPLE__) && !defined (VMDMESA) +#include +#include +#include +#include +#else + + +#ifdef _WINDOWS +#include +#include +#include +#else +#include +#endif //_WINDOWS +#endif //APPLE + + + +class btOpenGLInteropVertexBufferDescriptor : public btVertexBufferDescriptor +{ +protected: + /** OpenCL context */ + cl_context m_context; + + /** OpenCL command queue */ + cl_command_queue m_commandQueue; + + /** OpenCL interop buffer */ + cl_mem m_buffer; + + /** VBO in GL that is the basis of the interop buffer */ + GLuint m_openGLVBO; + + +public: + /** + * context is the OpenCL context this interop buffer will work in. + * queue is the command queue that kernels and data movement will be enqueued into. + * openGLVBO is the OpenGL vertex buffer data will be copied into. + * vertexOffset is the offset in floats to the first vertex. + * vertexStride is the stride in floats between vertices. + */ + btOpenGLInteropVertexBufferDescriptor( cl_command_queue cqCommandQue, cl_context context, GLuint openGLVBO, int vertexOffset, int vertexStride ) + { +#ifndef USE_MINICL + cl_int ciErrNum = CL_SUCCESS; + m_context = context; + m_commandQueue = cqCommandQue; + + m_vertexOffset = vertexOffset; + m_vertexStride = vertexStride; + + m_openGLVBO = openGLVBO; + + m_buffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, openGLVBO, &ciErrNum); + if( ciErrNum != CL_SUCCESS ) + { + btAssert( 0 && "clEnqueueAcquireGLObjects(copySoftBodyToVertexBuffer)"); + } + + m_hasVertexPositions = true; +#else + btAssert(0);//MiniCL shouldn't get here +#endif + } + + /** + * context is the OpenCL context this interop buffer will work in. + * queue is the command queue that kernels and data movement will be enqueued into. + * openGLVBO is the OpenGL vertex buffer data will be copied into. + * vertexOffset is the offset in floats to the first vertex. + * vertexStride is the stride in floats between vertices. + * normalOffset is the offset in floats to the first normal. + * normalStride is the stride in floats between normals. + */ + btOpenGLInteropVertexBufferDescriptor( cl_command_queue cqCommandQue, cl_context context, GLuint openGLVBO, int vertexOffset, int vertexStride, int normalOffset, int normalStride ) + { +#ifndef USE_MINICL + cl_int ciErrNum = CL_SUCCESS; + m_context = context; + m_commandQueue = cqCommandQue; + + m_openGLVBO = openGLVBO; + + m_buffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, openGLVBO, &ciErrNum); + if( ciErrNum != CL_SUCCESS ) + { + btAssert( 0 && "clEnqueueAcquireGLObjects(copySoftBodyToVertexBuffer)"); + } + + m_vertexOffset = vertexOffset; + m_vertexStride = vertexStride; + m_hasVertexPositions = true; + + m_normalOffset = normalOffset; + m_normalStride = normalStride; + m_hasNormals = true; +#else + btAssert(0); +#endif //USE_MINICL + + } + + virtual ~btOpenGLInteropVertexBufferDescriptor() + { + clReleaseMemObject( m_buffer ); + } + + /** + * Return the type of the vertex buffer descriptor. + */ + virtual BufferTypes getBufferType() const + { + return OPENGL_BUFFER; + } + + virtual cl_context getContext() const + { + return m_context; + } + + virtual cl_mem getBuffer() const + { + return m_buffer; + } +}; + +#endif // #ifndef BT_SOFT_BODY_SOLVER_VERTEX_BUFFER_OPENGL_H \ No newline at end of file diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp new file mode 100644 index 000000000..aeee3364a --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp @@ -0,0 +1,1063 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + +#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h" +#include "vectormath/vmInclude.h" +#include //@todo: remove the debugging printf at some stage +#include "btSoftBodySolver_OpenCLSIMDAware.h" +#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h" +#include "BulletSoftBody/btSoftBody.h" +#include "BulletCollision/CollisionShapes/btCapsuleShape.h" + +#define WAVEFRONT_SIZE 32 +#define WAVEFRONT_BLOCK_MULTIPLIER 2 +#define GROUP_SIZE (WAVEFRONT_SIZE*WAVEFRONT_BLOCK_MULTIPLIER) +#define LINKS_PER_SIMD_LANE 16 + +static const size_t workGroupSize = GROUP_SIZE; + + +//CL_VERSION_1_1 seems broken on NVidia SDK so just disable it + +#if (0)//CL_VERSION_1_1 == 1) + //OpenCL 1.1 kernels use float3 +#define MSTRINGIFY(A) #A +static char* UpdatePositionsFromVelocitiesCLString = +#include "OpenCLC/UpdatePositionsFromVelocities.cl" +static char* SolvePositionsCLString = +#include "OpenCLC/SolvePositionsSIMDBatched.cl" +static char* UpdateNodesCLString = +#include "OpenCLC/UpdateNodes.cl" +static char* UpdatePositionsCLString = +#include "OpenCLC/UpdatePositions.cl" +static char* UpdateConstantsCLString = +#include "OpenCLC/UpdateConstants.cl" +static char* IntegrateCLString = +#include "OpenCLC/Integrate.cl" +static char* ApplyForcesCLString = +#include "OpenCLC/ApplyForces.cl" +static char* UpdateNormalsCLString = +#include "OpenCLC/UpdateNormals.cl" +static char* VSolveLinksCLString = +#include "OpenCLC/VSolveLinks.cl" +static char* ComputeBoundsCLString = +#include "OpenCLC/ComputeBounds.cl" +static char* SolveCollisionsAndUpdateVelocitiesCLString = +#include "OpenCLC/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl" +static char* OutputToVertexArrayCLString = +#include "OpenCLC/OutputToVertexArray.cl" +#else +////OpenCL 1.0 kernels don't use float3 +#define MSTRINGIFY(A) #A +static char* UpdatePositionsFromVelocitiesCLString = +#include "OpenCLC10/UpdatePositionsFromVelocities.cl" +static char* SolvePositionsCLString = +#include "OpenCLC10/SolvePositionsSIMDBatched.cl" +static char* UpdateNodesCLString = +#include "OpenCLC10/UpdateNodes.cl" +static char* UpdatePositionsCLString = +#include "OpenCLC10/UpdatePositions.cl" +static char* UpdateConstantsCLString = +#include "OpenCLC10/UpdateConstants.cl" +static char* IntegrateCLString = +#include "OpenCLC10/Integrate.cl" +static char* ApplyForcesCLString = +#include "OpenCLC10/ApplyForces.cl" +static char* UpdateNormalsCLString = +#include "OpenCLC10/UpdateNormals.cl" +static char* VSolveLinksCLString = +#include "OpenCLC10/VSolveLinks.cl" +static char* ComputeBoundsCLString = +#include "OpenCLC10/ComputeBounds.cl" +static char* SolveCollisionsAndUpdateVelocitiesCLString = +#include "OpenCLC10/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl" +static char* OutputToVertexArrayCLString = +#include "OpenCLC10/OutputToVertexArray.cl" +#endif //CL_VERSION_1_1 + + + +btSoftBodyLinkDataOpenCLSIMDAware::btSoftBodyLinkDataOpenCLSIMDAware(cl_command_queue queue, cl_context ctx) : + m_cqCommandQue(queue), + m_wavefrontSize( WAVEFRONT_SIZE ), + m_linksPerWorkItem( LINKS_PER_SIMD_LANE ), + m_maxBatchesWithinWave( 0 ), + m_maxLinksPerWavefront( m_wavefrontSize * m_linksPerWorkItem ), + m_numWavefronts( 0 ), + m_maxVertex( 0 ), + m_clNumBatchesAndVerticesWithinWaves( queue, ctx, &m_numBatchesAndVerticesWithinWaves, true ), + m_clWavefrontVerticesGlobalAddresses( queue, ctx, &m_wavefrontVerticesGlobalAddresses, true ), + m_clLinkVerticesLocalAddresses( queue, ctx, &m_linkVerticesLocalAddresses, true ), + m_clLinkStrength( queue, ctx, &m_linkStrength, false ), + m_clLinksMassLSC( queue, ctx, &m_linksMassLSC, false ), + m_clLinksRestLengthSquared( queue, ctx, &m_linksRestLengthSquared, false ), + m_clLinksRestLength( queue, ctx, &m_linksRestLength, false ), + m_clLinksMaterialLinearStiffnessCoefficient( queue, ctx, &m_linksMaterialLinearStiffnessCoefficient, false ) +{ +} + +btSoftBodyLinkDataOpenCLSIMDAware::~btSoftBodyLinkDataOpenCLSIMDAware() +{ +} + +static Vectormath::Aos::Vector3 toVector3( const btVector3 &vec ) +{ + Vectormath::Aos::Vector3 outVec( vec.getX(), vec.getY(), vec.getZ() ); + return outVec; +} + +/** Allocate enough space in all link-related arrays to fit numLinks links */ +void btSoftBodyLinkDataOpenCLSIMDAware::createLinks( int numLinks ) +{ + int previousSize = m_links.size(); + int newSize = previousSize + numLinks; + + btSoftBodyLinkData::createLinks( numLinks ); + + // Resize the link addresses array as well + m_linkAddresses.resize( newSize ); +} + +/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */ +void btSoftBodyLinkDataOpenCLSIMDAware::setLinkAt( + const LinkDescription &link, + int linkIndex ) +{ + btSoftBodyLinkData::setLinkAt( link, linkIndex ); + + if( link.getVertex0() > m_maxVertex ) + m_maxVertex = link.getVertex0(); + if( link.getVertex1() > m_maxVertex ) + m_maxVertex = link.getVertex1(); + + // Set the link index correctly for initialisation + m_linkAddresses[linkIndex] = linkIndex; +} + +bool btSoftBodyLinkDataOpenCLSIMDAware::onAccelerator() +{ + return m_onGPU; +} + +bool btSoftBodyLinkDataOpenCLSIMDAware::moveToAccelerator() +{ + bool success = true; + success = success && m_clNumBatchesAndVerticesWithinWaves.moveToGPU(); + success = success && m_clWavefrontVerticesGlobalAddresses.moveToGPU(); + success = success && m_clLinkVerticesLocalAddresses.moveToGPU(); + success = success && m_clLinkStrength.moveToGPU(); + success = success && m_clLinksMassLSC.moveToGPU(); + success = success && m_clLinksRestLengthSquared.moveToGPU(); + success = success && m_clLinksRestLength.moveToGPU(); + success = success && m_clLinksMaterialLinearStiffnessCoefficient.moveToGPU(); + + if( success ) { + m_onGPU = true; + } + + return success; +} + +bool btSoftBodyLinkDataOpenCLSIMDAware::moveFromAccelerator() +{ + bool success = true; + success = success && m_clNumBatchesAndVerticesWithinWaves.moveToGPU(); + success = success && m_clWavefrontVerticesGlobalAddresses.moveToGPU(); + success = success && m_clLinkVerticesLocalAddresses.moveToGPU(); + success = success && m_clLinkStrength.moveFromGPU(); + success = success && m_clLinksMassLSC.moveFromGPU(); + success = success && m_clLinksRestLengthSquared.moveFromGPU(); + success = success && m_clLinksRestLength.moveFromGPU(); + success = success && m_clLinksMaterialLinearStiffnessCoefficient.moveFromGPU(); + + if( success ) { + m_onGPU = false; + } + + return success; +} + + + + + + + + +btOpenCLSoftBodySolverSIMDAware::btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue, cl_context ctx) : + btOpenCLSoftBodySolver( queue, ctx ), + m_linkData(queue, ctx) +{ + // Initial we will clearly need to update solver constants + // For now this is global for the cloths linked with this solver - we should probably make this body specific + // for performance in future once we understand more clearly when constants need to be updated + m_updateSolverConstants = true; + + m_shadersInitialized = false; +} + +btOpenCLSoftBodySolverSIMDAware::~btOpenCLSoftBodySolverSIMDAware() +{ + releaseKernels(); +} + +void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody * > &softBodies ,bool forceUpdate) +{ + if( forceUpdate|| m_softBodySet.size() != softBodies.size() ) + { + // Have a change in the soft body set so update, reloading all the data + getVertexData().clear(); + getTriangleData().clear(); + getLinkData().clear(); + m_softBodySet.resize(0); + + + for( int softBodyIndex = 0; softBodyIndex < softBodies.size(); ++softBodyIndex ) + { + btSoftBody *softBody = softBodies[ softBodyIndex ]; + using Vectormath::Aos::Matrix3; + using Vectormath::Aos::Point3; + + // Create SoftBody that will store the information within the solver + btOpenCLAcceleratedSoftBodyInterface* newSoftBody = new btOpenCLAcceleratedSoftBodyInterface( softBody ); + m_softBodySet.push_back( newSoftBody ); + + m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) ); + m_perClothDampingFactor.push_back(softBody->m_cfg.kDP); + m_perClothVelocityCorrectionCoefficient.push_back( softBody->m_cfg.kVCF ); + m_perClothLiftFactor.push_back( softBody->m_cfg.kLF ); + m_perClothDragFactor.push_back( softBody->m_cfg.kDG ); + m_perClothMediumDensity.push_back(softBody->getWorldInfo()->air_density); + // Simple init values. Actually we'll put 0 and -1 into them at the appropriate time + m_perClothMinBounds.push_back( UIntVector3(UINT_MAX, UINT_MAX, UINT_MAX) ); + m_perClothMaxBounds.push_back( UIntVector3(0, 0, 0) ); + /*m_perClothMinBounds.push_back( UINT_MAX ); + m_perClothMaxBounds.push_back( 0 ); + m_perClothMinBounds.push_back( UINT_MAX ); + m_perClothMaxBounds.push_back( 0 ); + m_perClothMinBounds.push_back( UINT_MAX ); + m_perClothMaxBounds.push_back( 0 ); + m_perClothMinBounds.push_back( UINT_MAX ); + m_perClothMaxBounds.push_back( 0 );*/ + + + m_perClothFriction.push_back( softBody->getFriction() ); + m_perClothCollisionObjects.push_back( CollisionObjectIndices(-1, -1) ); + + // Add space for new vertices and triangles in the default solver for now + // TODO: Include space here for tearing too later + int firstVertex = getVertexData().getNumVertices(); + int numVertices = softBody->m_nodes.size(); + // Round maxVertices to a multiple of the workgroup size so we know we're safe to run over in a given group + // maxVertices can be increased to allow tearing, but should be used sparingly because these extra verts will always be processed + int maxVertices = GROUP_SIZE*((numVertices+GROUP_SIZE)/GROUP_SIZE); + // Allocate space for new vertices in all the vertex arrays + getVertexData().createVertices( numVertices, softBodyIndex, maxVertices ); + + int firstTriangle = getTriangleData().getNumTriangles(); + int numTriangles = softBody->m_faces.size(); + int maxTriangles = numTriangles; + getTriangleData().createTriangles( maxTriangles ); + + // Copy vertices from softbody into the solver + for( int vertex = 0; vertex < numVertices; ++vertex ) + { + Point3 multPoint(softBody->m_nodes[vertex].m_x.getX(), softBody->m_nodes[vertex].m_x.getY(), softBody->m_nodes[vertex].m_x.getZ()); + btSoftBodyVertexData::VertexDescription desc; + + // TODO: Position in the softbody might be pre-transformed + // or we may need to adapt for the pose. + //desc.setPosition( cloth.getMeshTransform()*multPoint ); + desc.setPosition( multPoint ); + + float vertexInverseMass = softBody->m_nodes[vertex].m_im; + desc.setInverseMass(vertexInverseMass); + getVertexData().setVertexAt( desc, firstVertex + vertex ); + } + + // Copy triangles similarly + // We're assuming here that vertex indices are based on the firstVertex rather than the entire scene + for( int triangle = 0; triangle < numTriangles; ++triangle ) + { + // Note that large array storage is relative to the array not to the cloth + // So we need to add firstVertex to each value + int vertexIndex0 = (softBody->m_faces[triangle].m_n[0] - &(softBody->m_nodes[0])); + int vertexIndex1 = (softBody->m_faces[triangle].m_n[1] - &(softBody->m_nodes[0])); + int vertexIndex2 = (softBody->m_faces[triangle].m_n[2] - &(softBody->m_nodes[0])); + btSoftBodyTriangleData::TriangleDescription newTriangle(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, vertexIndex2 + firstVertex); + getTriangleData().setTriangleAt( newTriangle, firstTriangle + triangle ); + + // Increase vertex triangle counts for this triangle + getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex0)++; + getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex1)++; + getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex2)++; + } + + int firstLink = getLinkData().getNumLinks(); + int numLinks = softBody->m_links.size(); + int maxLinks = numLinks; + + // Allocate space for the links + getLinkData().createLinks( numLinks ); + + // Add the links + for( int link = 0; link < numLinks; ++link ) + { + int vertexIndex0 = softBody->m_links[link].m_n[0] - &(softBody->m_nodes[0]); + int vertexIndex1 = softBody->m_links[link].m_n[1] - &(softBody->m_nodes[0]); + + btSoftBodyLinkData::LinkDescription newLink(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, softBody->m_links[link].m_material->m_kLST); + newLink.setLinkStrength(1.f); + getLinkData().setLinkAt(newLink, firstLink + link); + } + + newSoftBody->setFirstVertex( firstVertex ); + newSoftBody->setFirstTriangle( firstTriangle ); + newSoftBody->setNumVertices( numVertices ); + newSoftBody->setMaxVertices( maxVertices ); + newSoftBody->setNumTriangles( numTriangles ); + newSoftBody->setMaxTriangles( maxTriangles ); + newSoftBody->setFirstLink( firstLink ); + newSoftBody->setNumLinks( numLinks ); + } + + + + updateConstants(0.f); + + + m_linkData.generateBatches(); + m_triangleData.generateBatches(); + + + // Build the shaders to match the batching parameters + buildShaders(); + } +} + + +btSoftBodyLinkData &btOpenCLSoftBodySolverSIMDAware::getLinkData() +{ + // TODO: Consider setting link data to "changed" here + return m_linkData; +} + + + + +void btOpenCLSoftBodySolverSIMDAware::updateConstants( float timeStep ) +{ + + using namespace Vectormath::Aos; + + if( m_updateSolverConstants ) + { + m_updateSolverConstants = false; + + // Will have to redo this if we change the structure (tear, maybe) or various other possible changes + + // Initialise link constants + const int numLinks = m_linkData.getNumLinks(); + for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex ) + { + btSoftBodyLinkData::LinkNodePair &vertices( m_linkData.getVertexPair(linkIndex) ); + m_linkData.getRestLength(linkIndex) = length((m_vertexData.getPosition( vertices.vertex0 ) - m_vertexData.getPosition( vertices.vertex1 ))); + float invMass0 = m_vertexData.getInverseMass(vertices.vertex0); + float invMass1 = m_vertexData.getInverseMass(vertices.vertex1); + float linearStiffness = m_linkData.getLinearStiffnessCoefficient(linkIndex); + float massLSC = (invMass0 + invMass1)/linearStiffness; + m_linkData.getMassLSC(linkIndex) = massLSC; + float restLength = m_linkData.getRestLength(linkIndex); + float restLengthSquared = restLength*restLength; + m_linkData.getRestLengthSquared(linkIndex) = restLengthSquared; + } + } + +} + + + +void btOpenCLSoftBodySolverSIMDAware::solveConstraints( float solverdt ) +{ + + using Vectormath::Aos::Vector3; + using Vectormath::Aos::Point3; + using Vectormath::Aos::lengthSqr; + using Vectormath::Aos::dot; + + // Prepare links + int numLinks = m_linkData.getNumLinks(); + int numVertices = m_vertexData.getNumVertices(); + + float kst = 1.f; + float ti = 0.f; + + + m_clPerClothDampingFactor.moveToGPU(); + m_clPerClothVelocityCorrectionCoefficient.moveToGPU(); + + + // Ensure data is on accelerator + m_linkData.moveToAccelerator(); + m_vertexData.moveToAccelerator(); + + + //prepareLinks(); + + prepareCollisionConstraints(); + + // Solve drift + for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration ) + { + + for( int i = 0; i < m_linkData.m_wavefrontBatchStartLengths.size(); ++i ) + { + int startWave = m_linkData.m_wavefrontBatchStartLengths[i].start; + int numWaves = m_linkData.m_wavefrontBatchStartLengths[i].length; + solveLinksForPosition( startWave, numWaves, kst, ti ); + } + } // for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration ) + + + // At this point assume that the force array is blank - we will overwrite it + solveCollisionsAndUpdateVelocities( 1.f/solverdt ); +} + + +////////////////////////////////////// +// Kernel dispatches + + +void btOpenCLSoftBodySolverSIMDAware::solveLinksForPosition( int startWave, int numWaves, float kst, float ti ) +{ + cl_int ciErrNum; + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,0, sizeof(int), &startWave); + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,1, sizeof(int), &numWaves); + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,2, sizeof(float), &kst); + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,3, sizeof(float), &ti); + + + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,4, sizeof(cl_mem), &m_linkData.m_clNumBatchesAndVerticesWithinWaves.m_buffer); + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,5, sizeof(cl_mem), &m_linkData.m_clWavefrontVerticesGlobalAddresses.m_buffer); + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,6, sizeof(cl_mem), &m_linkData.m_clLinkVerticesLocalAddresses.m_buffer); + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,7, sizeof(cl_mem), &m_linkData.m_clLinksMassLSC.m_buffer); + + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,8, sizeof(cl_mem), &m_linkData.m_clLinksRestLengthSquared.m_buffer); + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,9, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer); + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,10, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer); + + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,11, WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_int2), 0); + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,12, m_linkData.getMaxVerticesPerWavefront()*WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_float4), 0); + ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,13, m_linkData.getMaxVerticesPerWavefront()*WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_float), 0); + + size_t numWorkItems = workGroupSize*((numWaves*WAVEFRONT_SIZE + (workGroupSize-1)) / workGroupSize); + + ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,solvePositionsFromLinksKernel,1,NULL,&numWorkItems,&workGroupSize,0,0,0); + + if( ciErrNum!= CL_SUCCESS ) + { + btAssert( 0 && "enqueueNDRangeKernel(solvePositionsFromLinksKernel)"); + } + +} // solveLinksForPosition + +void btOpenCLSoftBodySolverSIMDAware::solveCollisionsAndUpdateVelocities( float isolverdt ) +{ + // Copy kernel parameters to GPU + m_vertexData.moveToAccelerator(); + m_clPerClothFriction.moveToGPU(); + m_clPerClothDampingFactor.moveToGPU(); + m_clPerClothCollisionObjects.moveToGPU(); + m_clCollisionObjectDetails.moveToGPU(); + + cl_int ciErrNum; + int numVerts = m_vertexData.getNumVertices(); + ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 0, sizeof(int), &numVerts); + ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 1, sizeof(int), &isolverdt); + ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 2, sizeof(cl_mem),&m_vertexData.m_clClothIdentifier.m_buffer); + ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 3, sizeof(cl_mem),&m_vertexData.m_clVertexPreviousPosition.m_buffer); + ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 4, sizeof(cl_mem),&m_clPerClothFriction.m_buffer); + ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 5, sizeof(cl_mem),&m_clPerClothDampingFactor.m_buffer); + ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 6, sizeof(cl_mem),&m_clPerClothCollisionObjects.m_buffer); + ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 7, sizeof(cl_mem),&m_clCollisionObjectDetails.m_buffer); + ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 8, sizeof(cl_mem),&m_vertexData.m_clVertexForceAccumulator.m_buffer); + ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 9, sizeof(cl_mem),&m_vertexData.m_clVertexVelocity.m_buffer); + ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 10, sizeof(cl_mem),&m_vertexData.m_clVertexPosition.m_buffer); + ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 11, sizeof(CollisionShapeDescription)*16,0); + size_t numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize); + + if (numWorkItems) + { + ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,solveCollisionsAndUpdateVelocitiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0); + + if( ciErrNum != CL_SUCCESS ) + { + btAssert( 0 && "enqueueNDRangeKernel(solveCollisionsAndUpdateVelocitiesKernel)"); + } + } + +} // btOpenCLSoftBodySolverSIMDAware::updateVelocitiesFromPositionsWithoutVelocities + +// End kernel dispatches +///////////////////////////////////// + + + +bool btOpenCLSoftBodySolverSIMDAware::buildShaders() +{ + bool returnVal = true; + + if( m_shadersInitialized ) + return true; + + char *wavefrontMacros = new char[256]; + + sprintf( + wavefrontMacros, + "-DMAX_NUM_VERTICES_PER_WAVE=%d -DMAX_BATCHES_PER_WAVE=%d -DWAVEFRONT_SIZE=%d -DWAVEFRONT_BLOCK_MULTIPLIER=%d -DBLOCK_SIZE=%d", + m_linkData.getMaxVerticesPerWavefront(), + m_linkData.getMaxBatchesPerWavefront(), + m_linkData.getWavefrontSize(), + WAVEFRONT_BLOCK_MULTIPLIER, + WAVEFRONT_BLOCK_MULTIPLIER*m_linkData.getWavefrontSize()); + + updatePositionsFromVelocitiesKernel = clFunctions.compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel", "" ); + solvePositionsFromLinksKernel = clFunctions.compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel", wavefrontMacros ); + updateVelocitiesFromPositionsWithVelocitiesKernel = clFunctions.compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel", "" ); + updateVelocitiesFromPositionsWithoutVelocitiesKernel = clFunctions.compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel", "" ); + integrateKernel = clFunctions.compileCLKernelFromString( IntegrateCLString, "IntegrateKernel", "" ); + applyForcesKernel = clFunctions.compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel", "" ); + solveCollisionsAndUpdateVelocitiesKernel = clFunctions.compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel", "" ); + computeBoundsKernel = clFunctions.compileCLKernelFromString( ComputeBoundsCLString, "ComputeBoundsKernel" ); + + // TODO: Rename to UpdateSoftBodies + resetNormalsAndAreasKernel = clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel", "" ); + normalizeNormalsAndAreasKernel = clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel", "" ); + updateSoftBodiesKernel = clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel", "" ); + + delete [] wavefrontMacros; + + if( returnVal ) + m_shadersInitialized = true; + + return returnVal; +} + + + + +static Vectormath::Aos::Transform3 toTransform3( const btTransform &transform ) +{ + Vectormath::Aos::Transform3 outTransform; + outTransform.setCol(0, toVector3(transform.getBasis().getColumn(0))); + outTransform.setCol(1, toVector3(transform.getBasis().getColumn(1))); + outTransform.setCol(2, toVector3(transform.getBasis().getColumn(2))); + outTransform.setCol(3, toVector3(transform.getOrigin())); + return outTransform; +} + + +static void generateBatchesOfWavefronts( btAlignedObjectArray < btAlignedObjectArray > &linksForWavefronts, btSoftBodyLinkData &linkData, int numVertices, btAlignedObjectArray < btAlignedObjectArray > &wavefrontBatches ) +{ + // A per-batch map of truth values stating whether a given vertex is in that batch + // This allows us to significantly optimize the batching + btAlignedObjectArray > mapOfVerticesInBatches; + + for( int waveIndex = 0; waveIndex < linksForWavefronts.size(); ++waveIndex ) + { + btAlignedObjectArray &wavefront( linksForWavefronts[waveIndex] ); + + int batch = 0; + bool placed = false; + while( batch < wavefrontBatches.size() && !placed ) + { + // Test the current batch, see if this wave shares any vertex with the waves in the batch + bool foundSharedVertex = false; + for( int link = 0; link < wavefront.size(); ++link ) + { + btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] ); + if( (mapOfVerticesInBatches[batch])[vertices.vertex0] || (mapOfVerticesInBatches[batch])[vertices.vertex1] ) + { + foundSharedVertex = true; + } + } + + if( !foundSharedVertex ) + { + wavefrontBatches[batch].push_back( waveIndex ); + // Insert vertices into this batch too + for( int link = 0; link < wavefront.size(); ++link ) + { + btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] ); + (mapOfVerticesInBatches[batch])[vertices.vertex0] = true; + (mapOfVerticesInBatches[batch])[vertices.vertex1] = true; + } + placed = true; + } + batch++; + } + if( batch == wavefrontBatches.size() && !placed ) + { + wavefrontBatches.resize( batch + 1 ); + wavefrontBatches[batch].push_back( waveIndex ); + + // And resize map as well + mapOfVerticesInBatches.resize( batch + 1 ); + + // Resize maps with total number of vertices + mapOfVerticesInBatches[batch].resize( numVertices, false ); + + // Insert vertices into this batch too + for( int link = 0; link < wavefront.size(); ++link ) + { + btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] ); + (mapOfVerticesInBatches[batch])[vertices.vertex0] = true; + (mapOfVerticesInBatches[batch])[vertices.vertex1] = true; + } + } + } + mapOfVerticesInBatches.clear(); +} + +// Function to remove an object from a vector maintaining correct ordering of the vector +template< typename T > static void removeFromVector( btAlignedObjectArray< T > &vectorToUpdate, int indexToRemove ) +{ + int currentSize = vectorToUpdate.size(); + for( int i = indexToRemove; i < (currentSize-1); ++i ) + { + vectorToUpdate[i] = vectorToUpdate[i+1]; + } + if( currentSize > 0 ) + vectorToUpdate.resize( currentSize - 1 ); +} + +/** + * Insert element into vectorToUpdate at index index. + */ +template< typename T > static void insertAtIndex( btAlignedObjectArray< T > &vectorToUpdate, int index, T element ) +{ + vectorToUpdate.resize( vectorToUpdate.size() + 1 ); + for( int i = (vectorToUpdate.size() - 1); i > index; --i ) + { + vectorToUpdate[i] = vectorToUpdate[i-1]; + } + vectorToUpdate[index] = element; +} + +/** + * Insert into btAlignedObjectArray assuming the array is ordered and maintaining both ordering and uniqueness. + * ie it treats vectorToUpdate as an ordered set. + */ +template< typename T > static void insertUniqueAndOrderedIntoVector( btAlignedObjectArray &vectorToUpdate, T element ) +{ + int index = 0; + while( index < vectorToUpdate.size() && vectorToUpdate[index] < element ) + { + index++; + } + if( index == vectorToUpdate.size() || vectorToUpdate[index] != element ) + insertAtIndex( vectorToUpdate, index, element ); +} + +static void generateLinksPerVertex( int numVertices, btSoftBodyLinkData &linkData, btAlignedObjectArray< int > &listOfLinksPerVertex, btAlignedObjectArray &numLinksPerVertex, int &maxLinks ) +{ + for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex ) + { + btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) ); + numLinksPerVertex[nodes.vertex0]++; + numLinksPerVertex[nodes.vertex1]++; + } + int maxLinksPerVertex = 0; + for( int vertexIndex = 0; vertexIndex < numVertices; ++vertexIndex ) + { + maxLinksPerVertex = btMax(numLinksPerVertex[vertexIndex], maxLinksPerVertex); + } + maxLinks = maxLinksPerVertex; + + btAlignedObjectArray< int > linksFoundPerVertex; + linksFoundPerVertex.resize( numVertices, 0 ); + + listOfLinksPerVertex.resize( maxLinksPerVertex * numVertices ); + + for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex ) + { + btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) ); + { + // Do vertex 0 + int vertexIndex = nodes.vertex0; + int linkForVertex = linksFoundPerVertex[nodes.vertex0]; + int linkAddress = vertexIndex * maxLinksPerVertex + linkForVertex; + + listOfLinksPerVertex[linkAddress] = linkIndex; + + linksFoundPerVertex[nodes.vertex0] = linkForVertex + 1; + } + { + // Do vertex 1 + int vertexIndex = nodes.vertex1; + int linkForVertex = linksFoundPerVertex[nodes.vertex1]; + int linkAddress = vertexIndex * maxLinksPerVertex + linkForVertex; + + listOfLinksPerVertex[linkAddress] = linkIndex; + + linksFoundPerVertex[nodes.vertex1] = linkForVertex + 1; + } + } +} + +static void computeBatchingIntoWavefronts( + btSoftBodyLinkData &linkData, + int wavefrontSize, + int linksPerWorkItem, + int maxLinksPerWavefront, + btAlignedObjectArray < btAlignedObjectArray > &linksForWavefronts, + btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray > > &batchesWithinWaves, /* wave, batch, links in batch */ + btAlignedObjectArray< btAlignedObjectArray< int > > &verticesForWavefronts /* wavefront, vertex */ + ) +{ + + + // Attempt generation of larger batches of links. + btAlignedObjectArray< bool > processedLink; + processedLink.resize( linkData.getNumLinks() ); + btAlignedObjectArray< int > listOfLinksPerVertex; + int maxLinksPerVertex = 0; + + // Count num vertices + int numVertices = 0; + for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex ) + { + btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) ); + numVertices = btMax( numVertices, nodes.vertex0 + 1 ); + numVertices = btMax( numVertices, nodes.vertex1 + 1 ); + } + + // Need list of links per vertex + // Compute valence of each vertex + btAlignedObjectArray numLinksPerVertex; + numLinksPerVertex.resize(0); + numLinksPerVertex.resize( numVertices, 0 ); + + generateLinksPerVertex( numVertices, linkData, listOfLinksPerVertex, numLinksPerVertex, maxLinksPerVertex ); + + if (!numVertices) + return; + + for( int vertex = 0; vertex < 10; ++vertex ) + { + for( int link = 0; link < numLinksPerVertex[vertex]; ++link ) + { + int linkAddress = vertex * maxLinksPerVertex + link; + } + } + + + // At this point we know what links we have for each vertex so we can start batching + + // We want a vertex to start with, let's go with 0 + int currentVertex = 0; + int linksProcessed = 0; + + btAlignedObjectArray verticesToProcess; + + while( linksProcessed < linkData.getNumLinks() ) + { + // Next wavefront + int nextWavefront = linksForWavefronts.size(); + linksForWavefronts.resize( nextWavefront + 1 ); + btAlignedObjectArray &linksForWavefront(linksForWavefronts[nextWavefront]); + verticesForWavefronts.resize( nextWavefront + 1 ); + btAlignedObjectArray &vertexSet( verticesForWavefronts[nextWavefront] ); + + linksForWavefront.resize(0); + + // Loop to find enough links to fill the wavefront + // Stopping if we either run out of links, or fill it + while( linksProcessed < linkData.getNumLinks() && linksForWavefront.size() < maxLinksPerWavefront ) + { + // Go through the links for the current vertex + for( int link = 0; link < numLinksPerVertex[currentVertex] && linksForWavefront.size() < maxLinksPerWavefront; ++link ) + { + int linkAddress = currentVertex * maxLinksPerVertex + link; + int linkIndex = listOfLinksPerVertex[linkAddress]; + + // If we have not already processed this link, add it to the wavefront + // Claim it as another processed link + // Add the vertex at the far end to the list of vertices to process. + if( !processedLink[linkIndex] ) + { + linksForWavefront.push_back( linkIndex ); + linksProcessed++; + processedLink[linkIndex] = true; + int v0 = linkData.getVertexPair(linkIndex).vertex0; + int v1 = linkData.getVertexPair(linkIndex).vertex1; + if( v0 == currentVertex ) + verticesToProcess.push_back( v1 ); + else + verticesToProcess.push_back( v0 ); + } + } + if( verticesToProcess.size() > 0 ) + { + // Get the element on the front of the queue and remove it + currentVertex = verticesToProcess[0]; + removeFromVector( verticesToProcess, 0 ); + } else { + // If we've not yet processed all the links, find the first unprocessed one + // and select one of its vertices as the current vertex + if( linksProcessed < linkData.getNumLinks() ) + { + int searchLink = 0; + while( processedLink[searchLink] ) + searchLink++; + currentVertex = linkData.getVertexPair(searchLink).vertex0; + } + } + } + + // We have either finished or filled a wavefront + for( int link = 0; link < linksForWavefront.size(); ++link ) + { + int v0 = linkData.getVertexPair( linksForWavefront[link] ).vertex0; + int v1 = linkData.getVertexPair( linksForWavefront[link] ).vertex1; + insertUniqueAndOrderedIntoVector( vertexSet, v0 ); + insertUniqueAndOrderedIntoVector( vertexSet, v1 ); + } + // Iterate over links mapped to the wave and batch those + // We can run a batch on each cycle trivially + + batchesWithinWaves.resize( batchesWithinWaves.size() + 1 ); + btAlignedObjectArray < btAlignedObjectArray > &batchesWithinWave( batchesWithinWaves[batchesWithinWaves.size()-1] ); + + + for( int link = 0; link < linksForWavefront.size(); ++link ) + { + int linkIndex = linksForWavefront[link]; + btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( linkIndex ); + + int batch = 0; + bool placed = false; + while( batch < batchesWithinWave.size() && !placed ) + { + bool foundSharedVertex = false; + if( batchesWithinWave[batch].size() >= wavefrontSize ) + { + // If we have already filled this batch, move on to another + foundSharedVertex = true; + } else { + for( int link2 = 0; link2 < batchesWithinWave[batch].size(); ++link2 ) + { + btSoftBodyLinkData::LinkNodePair vertices2 = linkData.getVertexPair( (batchesWithinWave[batch])[link2] ); + + if( vertices.vertex0 == vertices2.vertex0 || + vertices.vertex1 == vertices2.vertex0 || + vertices.vertex0 == vertices2.vertex1 || + vertices.vertex1 == vertices2.vertex1 ) + { + foundSharedVertex = true; + break; + } + } + } + if( !foundSharedVertex ) + { + batchesWithinWave[batch].push_back( linkIndex ); + placed = true; + } else { + ++batch; + } + } + if( batch == batchesWithinWave.size() && !placed ) + { + batchesWithinWave.resize( batch + 1 ); + batchesWithinWave[batch].push_back( linkIndex ); + } + } + + } + +} + +void btSoftBodyLinkDataOpenCLSIMDAware::generateBatches() +{ + btAlignedObjectArray < btAlignedObjectArray > linksForWavefronts; + btAlignedObjectArray < btAlignedObjectArray > wavefrontBatches; + btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray > > batchesWithinWaves; + btAlignedObjectArray< btAlignedObjectArray< int > > verticesForWavefronts; // wavefronts, vertices in wavefront as an ordered set + + // Group the links into wavefronts + computeBatchingIntoWavefronts( *this, m_wavefrontSize, m_linksPerWorkItem, m_maxLinksPerWavefront, linksForWavefronts, batchesWithinWaves, verticesForWavefronts ); + + + // Batch the wavefronts + generateBatchesOfWavefronts( linksForWavefronts, *this, m_maxVertex, wavefrontBatches ); + + m_numWavefronts = linksForWavefronts.size(); + + // At this point we have a description of which links we need to process in each wavefront + + // First correctly fill the batch ranges vector + int numBatches = wavefrontBatches.size(); + m_wavefrontBatchStartLengths.resize(0); + int prefixSum = 0; + for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex ) + { + int wavesInBatch = wavefrontBatches[batchIndex].size(); + int nextPrefixSum = prefixSum + wavesInBatch; + m_wavefrontBatchStartLengths.push_back( BatchPair( prefixSum, nextPrefixSum - prefixSum ) ); + + prefixSum += wavesInBatch; + } + + // Also find max number of batches within a wave + m_maxBatchesWithinWave = 0; + m_maxVerticesWithinWave = 0; + m_numBatchesAndVerticesWithinWaves.resize( m_numWavefronts ); + for( int waveIndex = 0; waveIndex < m_numWavefronts; ++waveIndex ) + { + // See if the number of batches in this wave is greater than the current maxium + int batchesInCurrentWave = batchesWithinWaves[waveIndex].size(); + int verticesInCurrentWave = verticesForWavefronts[waveIndex].size(); + m_maxBatchesWithinWave = btMax( batchesInCurrentWave, m_maxBatchesWithinWave ); + m_maxVerticesWithinWave = btMax( verticesInCurrentWave, m_maxVerticesWithinWave ); + } + + // Add padding values both for alignment and as dudd addresses within LDS to compute junk rather than branch around + m_maxVerticesWithinWave = 16*((m_maxVerticesWithinWave/16)+2); + + // Now we know the maximum number of vertices per-wave we can resize the global vertices array + m_wavefrontVerticesGlobalAddresses.resize( m_maxVerticesWithinWave * m_numWavefronts ); + + // Grab backup copies of all the link data arrays for the sorting process + btAlignedObjectArray m_links_Backup(m_links); + btAlignedObjectArray m_linkStrength_Backup(m_linkStrength); + btAlignedObjectArray m_linksMassLSC_Backup(m_linksMassLSC); + btAlignedObjectArray m_linksRestLengthSquared_Backup(m_linksRestLengthSquared); + //btAlignedObjectArray m_linksCLength_Backup(m_linksCLength); + //btAlignedObjectArray m_linksLengthRatio_Backup(m_linksLengthRatio); + btAlignedObjectArray m_linksRestLength_Backup(m_linksRestLength); + btAlignedObjectArray m_linksMaterialLinearStiffnessCoefficient_Backup(m_linksMaterialLinearStiffnessCoefficient); + + // Resize to a wavefront sized batch per batch per wave so we get perfectly coherent memory accesses. + m_links.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts ); + m_linkVerticesLocalAddresses.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts ); + m_linkStrength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts ); + m_linksMassLSC.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts ); + m_linksRestLengthSquared.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts ); + m_linksRestLength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts ); + m_linksMaterialLinearStiffnessCoefficient.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts ); + + // Then re-order links into wavefront blocks + + // Total number of wavefronts moved. This will decide the ordering of sorted wavefronts. + int wavefrontCount = 0; + + // Iterate over batches of wavefronts, then wavefronts in the batch + for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex ) + { + btAlignedObjectArray &batch( wavefrontBatches[batchIndex] ); + int wavefrontsInBatch = batch.size(); + + + for( int wavefrontIndex = 0; wavefrontIndex < wavefrontsInBatch; ++wavefrontIndex ) + { + + int originalWavefrontIndex = batch[wavefrontIndex]; + btAlignedObjectArray< int > &wavefrontVertices( verticesForWavefronts[originalWavefrontIndex] ); + int verticesUsedByWavefront = wavefrontVertices.size(); + + // Copy the set of vertices into the correctly structured array for use on the device + // Fill the non-vertices with -1s + // so we can mask out those reads + for( int vertex = 0; vertex < verticesUsedByWavefront; ++vertex ) + { + m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = wavefrontVertices[vertex]; + } + for( int vertex = verticesUsedByWavefront; vertex < m_maxVerticesWithinWave; ++vertex ) + { + m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = -1; + } + + // Obtain the set of batches within the current wavefront + btAlignedObjectArray < btAlignedObjectArray > &batchesWithinWavefront( batchesWithinWaves[originalWavefrontIndex] ); + // Set the size of the batches for use in the solver, correctly ordered + NumBatchesVerticesPair batchesAndVertices; + batchesAndVertices.numBatches = batchesWithinWavefront.size(); + batchesAndVertices.numVertices = verticesUsedByWavefront; + m_numBatchesAndVerticesWithinWaves[wavefrontCount] = batchesAndVertices; + + + // Now iterate over batches within the wavefront to structure the links correctly + for( int wavefrontBatch = 0; wavefrontBatch < batchesWithinWavefront.size(); ++wavefrontBatch ) + { + btAlignedObjectArray &linksInBatch( batchesWithinWavefront[wavefrontBatch] ); + int wavefrontBatchSize = linksInBatch.size(); + + int batchAddressInTarget = m_maxBatchesWithinWave * m_wavefrontSize * wavefrontCount + m_wavefrontSize * wavefrontBatch; + + for( int linkIndex = 0; linkIndex < wavefrontBatchSize; ++linkIndex ) + { + int originalLinkAddress = linksInBatch[linkIndex]; + // Reorder simple arrays trivially + m_links[batchAddressInTarget + linkIndex] = m_links_Backup[originalLinkAddress]; + m_linkStrength[batchAddressInTarget + linkIndex] = m_linkStrength_Backup[originalLinkAddress]; + m_linksMassLSC[batchAddressInTarget + linkIndex] = m_linksMassLSC_Backup[originalLinkAddress]; + m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = m_linksRestLengthSquared_Backup[originalLinkAddress]; + m_linksRestLength[batchAddressInTarget + linkIndex] = m_linksRestLength_Backup[originalLinkAddress]; + m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = m_linksMaterialLinearStiffnessCoefficient_Backup[originalLinkAddress]; + + // The local address is more complicated. We need to work out where a given vertex will end up + // by searching the set of vertices for this link and using the index as the local address + btSoftBodyLinkData::LinkNodePair localPair; + btSoftBodyLinkData::LinkNodePair globalPair = m_links[batchAddressInTarget + linkIndex]; + localPair.vertex0 = wavefrontVertices.findLinearSearch( globalPair.vertex0 ); + localPair.vertex1 = wavefrontVertices.findLinearSearch( globalPair.vertex1 ); + m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair; + } + for( int linkIndex = wavefrontBatchSize; linkIndex < m_wavefrontSize; ++linkIndex ) + { + // Put 0s into these arrays for padding for cleanliness + m_links[batchAddressInTarget + linkIndex] = btSoftBodyLinkData::LinkNodePair(0, 0); + m_linkStrength[batchAddressInTarget + linkIndex] = 0.f; + m_linksMassLSC[batchAddressInTarget + linkIndex] = 0.f; + m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = 0.f; + m_linksRestLength[batchAddressInTarget + linkIndex] = 0.f; + m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = 0.f; + + + // For local addresses of junk data choose a set of addresses just above the range of valid ones + // and cycling tyhrough % 16 so that we don't have bank conficts between all dud addresses + // The valid addresses will do scatter and gather in the valid range, the junk ones should happily work + // off the end of that range so we need no control + btSoftBodyLinkData::LinkNodePair localPair; + localPair.vertex0 = verticesUsedByWavefront + (linkIndex % 16); + localPair.vertex1 = verticesUsedByWavefront + (linkIndex % 16); + m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair; + } + + } + + + wavefrontCount++; + } + + + } + +} // void btSoftBodyLinkDataDX11SIMDAware::generateBatches() + + + diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h new file mode 100644 index 000000000..db4f2ae8b --- /dev/null +++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.h @@ -0,0 +1,82 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef BT_SOFT_BODY_SOLVER_OPENCL_SIMDAWARE_H +#define BT_SOFT_BODY_SOLVER_OPENCL_SIMDAWARE_H + +#include "stddef.h" //for size_t +#include "vectormath/vmInclude.h" + +#include "btSoftBodySolver_OpenCL.h" +#include "btSoftBodySolverBuffer_OpenCL.h" +#include "btSoftBodySolverLinkData_OpenCLSIMDAware.h" +#include "btSoftBodySolverVertexData_OpenCL.h" +#include "btSoftBodySolverTriangleData_OpenCL.h" + + + + + +class btOpenCLSoftBodySolverSIMDAware : public btOpenCLSoftBodySolver +{ +protected: + + + btSoftBodyLinkDataOpenCLSIMDAware m_linkData; + + + bool m_shadersInitialized; + + + bool buildShaders(); + + + void updateConstants( float timeStep ); + + float computeTriangleArea( + const Vectormath::Aos::Point3 &vertex0, + const Vectormath::Aos::Point3 &vertex1, + const Vectormath::Aos::Point3 &vertex2 ); + + + ////////////////////////////////////// + // Kernel dispatches + void solveLinksForPosition( int startLink, int numLinks, float kst, float ti ); + + void solveCollisionsAndUpdateVelocities( float isolverdt ); + // End kernel dispatches + ///////////////////////////////////// + +public: + btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue,cl_context ctx); + + virtual ~btOpenCLSoftBodySolverSIMDAware(); + + virtual SolverTypes getSolverType() const + { + return CL_SIMD_SOLVER; + } + + + virtual btSoftBodyLinkData &getLinkData(); + + + virtual void optimize( btAlignedObjectArray< btSoftBody * > &softBodies , bool forceUpdate=false); + + virtual void solveConstraints( float solverdt ); + +}; // btOpenCLSoftBodySolverSIMDAware + +#endif // #ifndef BT_SOFT_BODY_SOLVER_OPENCL_SIMDAWARE_H