Added SSE4/FMA optimized constraint row solver implementation for btSequentialImpulseConstraintSolver,

thanks to Vladimir Bondarev (https://github.com/VladimirBondarev/bullet3/tree/c25d)
This commit is contained in:
Erwin Coumans
2014-05-01 17:13:50 -07:00
parent 7151865c16
commit 0e1b90d708
5 changed files with 301 additions and 69 deletions

View File

@@ -10,7 +10,7 @@ IF (USE_GLUT)
# ENDIF() # ENDIF()
SET(SharedDemoSubdirs SET(SharedDemoSubdirs
OpenGL OpenGL
CcdPhysicsDemo SliderConstraintDemo GenericJointDemo Raytracer CcdPhysicsDemo ConstraintDemo SliderConstraintDemo GenericJointDemo Raytracer
RagdollDemo ForkLiftDemo BasicDemo FeatherstoneMultiBodyDemo RollingFrictionDemo RaytestDemo VoronoiFractureDemo RagdollDemo ForkLiftDemo BasicDemo FeatherstoneMultiBodyDemo RollingFrictionDemo RaytestDemo VoronoiFractureDemo
GyroscopicDemo FractureDemo Box2dDemo BspDemo MovingConcaveDemo VehicleDemo GyroscopicDemo FractureDemo Box2dDemo BspDemo MovingConcaveDemo VehicleDemo
UserCollisionAlgorithm CharacterDemo SoftDemo UserCollisionAlgorithm CharacterDemo SoftDemo

View File

@@ -14,14 +14,14 @@
INCLUDE_DIRECTORIES( INCLUDE_DIRECTORIES(
${BULLET_PHYSICS_SOURCE_DIR}/src ${BULLET_PHYSICS_SOURCE_DIR}/src
../OpenGL ../OpenGL
${BULLET_PHYSICS_SOURCE_DIR}/Extras/Serialize/BulletFileLoader ${GLUT_INCLUDE_DIR}
${BULLET_PHYSICS_SOURCE_DIR}/Extras/Serialize/BulletWorldImporter
) )
IF (USE_GLUT) IF (USE_GLUT)
LINK_LIBRARIES( LINK_LIBRARIES(
OpenGLSupport BulletWorldImporter BulletDynamics BulletCollision LinearMath BulletFileLoader ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} OpenGLSupport BulletDynamics BulletCollision LinearMath ${GLUT_glut_LIBRARY}
${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
) )
ADD_EXECUTABLE(AppConstraintDemo ADD_EXECUTABLE(AppConstraintDemo
@@ -31,7 +31,7 @@ IF (USE_GLUT)
) )
ELSE (USE_GLUT) ELSE (USE_GLUT)
LINK_LIBRARIES( LINK_LIBRARIES(
OpenGLSupport BulletWorldImporter BulletDynamics BulletCollision LinearMath BulletFileLoader ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} OpenGLSupport BulletDynamics BulletCollision LinearMath ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
) )
ADD_EXECUTABLE(AppConstraintDemo ADD_EXECUTABLE(AppConstraintDemo

View File

@@ -37,57 +37,164 @@ int gNumSplitImpulseRecoveries = 0;
#include "BulletDynamics/Dynamics/btRigidBody.h" #include "BulletDynamics/Dynamics/btRigidBody.h"
btSequentialImpulseConstraintSolver::btSequentialImpulseConstraintSolver()
:m_btSeed2(0)
{
}
btSequentialImpulseConstraintSolver::~btSequentialImpulseConstraintSolver()
{
}
#ifdef USE_SIMD #ifdef USE_SIMD
#include <emmintrin.h> #include <intrin.h>//is it safe to include this header?
#define btVecSplat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e)) #define btVecSplat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e))
static inline __m128 btSimdDot3( __m128 vec0, __m128 vec1 ) static inline __m128 btSimdDot3( __m128 vec0, __m128 vec1 )
{ {
__m128 result = _mm_mul_ps( vec0, vec1); __m128 result = _mm_mul_ps( vec0, vec1);
return _mm_add_ps( btVecSplat( result, 0 ), _mm_add_ps( btVecSplat( result, 1 ), btVecSplat( result, 2 ) ) ); return _mm_add_ps( btVecSplat( result, 0 ), _mm_add_ps( btVecSplat( result, 1 ), btVecSplat( result, 2 ) ) );
} }
#endif//USE_SIMD
#define USE_FMA 1
#define USE_FMA3_INSTEAD_FMA4 1
#define USE_SSE4_DOT 0
#define SSE4_DP(a, b) _mm_dp_ps(a, b, 0x7f)
#define SSE4_DP_FP(a, b) _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7f))
#if USE_SSE4_DOT
#define DOT_PRODUCT(a, b) SSE4_DP(a, b)
#else
#define DOT_PRODUCT(a, b) btSimdDot3(a, b)
#endif
#if USE_FMA
#if USE_FMA3_INSTEAD_FMA4
// a*b + c
#define FMADD(a, b, c) _mm_fmadd_ps(a, b, c)
// -(a*b) + c
#define FMNADD(a, b, c) _mm_fnmadd_ps(a, b, c)
#else // USE_FMA3
// a*b + c
#define FMADD(a, b, c) _mm_macc_ps(a, b, c)
// -(a*b) + c
#define FMNADD(a, b, c) _mm_nmacc_ps(a, b, c)
#endif
#else // USE_FMA
// c + a*b
#define FMADD(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b))
// c - a*b
#define FMNADD(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b))
#endif
// Project Gauss Seidel or the equivalent Sequential Impulse // Project Gauss Seidel or the equivalent Sequential Impulse
btSimdScalar btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGenericSIMD(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c) static btSimdScalar gResolveSingleConstraintRowGeneric_sse2(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c)
{ {
#ifdef USE_SIMD
__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse); __m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);
__m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit); __m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
__m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit); __m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit);
btSimdScalar deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse),_mm_set1_ps(c.m_cfm))); btSimdScalar deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse), _mm_set1_ps(c.m_cfm)));
__m128 deltaVel1Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal1.mVec128,body1.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128,body1.internalGetDeltaAngularVelocity().mVec128)); __m128 deltaVel1Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal1.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128));
__m128 deltaVel2Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal2.mVec128,body2.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos2CrossNormal.mVec128,body2.internalGetDeltaAngularVelocity().mVec128)); __m128 deltaVel2Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal2.mVec128, body2.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128));
deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv))); deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel1Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv))); deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel2Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
btSimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse); btSimdScalar sum = _mm_add_ps(cpAppliedImp, deltaImpulse);
btSimdScalar resultLowerLess,resultUpperLess; btSimdScalar resultLowerLess, resultUpperLess;
resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1); resultLowerLess = _mm_cmplt_ps(sum, lowerLimit1);
resultUpperLess = _mm_cmplt_ps(sum,upperLimit1); resultUpperLess = _mm_cmplt_ps(sum, upperLimit1);
__m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp); __m128 lowMinApplied = _mm_sub_ps(lowerLimit1, cpAppliedImp);
deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) ); deltaImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse));
c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) ); c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum));
__m128 upperMinApplied = _mm_sub_ps(upperLimit1,cpAppliedImp); __m128 upperMinApplied = _mm_sub_ps(upperLimit1, cpAppliedImp);
deltaImpulse = _mm_or_ps( _mm_and_ps(resultUpperLess, deltaImpulse), _mm_andnot_ps(resultUpperLess, upperMinApplied) ); deltaImpulse = _mm_or_ps(_mm_and_ps(resultUpperLess, deltaImpulse), _mm_andnot_ps(resultUpperLess, upperMinApplied));
c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultUpperLess, c.m_appliedImpulse), _mm_andnot_ps(resultUpperLess, upperLimit1) ); c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultUpperLess, c.m_appliedImpulse), _mm_andnot_ps(resultUpperLess, upperLimit1));
__m128 linearComponentA = _mm_mul_ps(c.m_contactNormal1.mVec128,body1.internalGetInvMass().mVec128); __m128 linearComponentA = _mm_mul_ps(c.m_contactNormal1.mVec128, body1.internalGetInvMass().mVec128);
__m128 linearComponentB = _mm_mul_ps((c.m_contactNormal2).mVec128,body2.internalGetInvMass().mVec128); __m128 linearComponentB = _mm_mul_ps((c.m_contactNormal2).mVec128, body2.internalGetInvMass().mVec128);
__m128 impulseMagnitude = deltaImpulse; __m128 impulseMagnitude = deltaImpulse;
body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude)); body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentA, impulseMagnitude));
body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude)); body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentA.mVec128, impulseMagnitude));
body2.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude)); body2.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentB, impulseMagnitude));
body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude)); body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentB.mVec128, impulseMagnitude));
return deltaImpulse; return deltaImpulse;
}
// Enhanced version of gResolveSingleConstraintRowGeneric_sse2 with SSE4.1 and FMA3
static btSimdScalar gResolveSingleConstraintRowGeneric_sse4_1_fma3(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c)
{
__m128 tmp = _mm_set_ps1(c.m_jacDiagABInv);
__m128 deltaImpulse = _mm_set_ps1(c.m_rhs - btScalar(c.m_appliedImpulse)*c.m_cfm);
const __m128 lowerLimit = _mm_set_ps1(c.m_lowerLimit);
const __m128 upperLimit = _mm_set_ps1(c.m_upperLimit);
const __m128 deltaVel1Dotn = _mm_add_ps(DOT_PRODUCT(c.m_contactNormal1.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), DOT_PRODUCT(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128));
const __m128 deltaVel2Dotn = _mm_add_ps(DOT_PRODUCT(c.m_contactNormal2.mVec128, body2.internalGetDeltaLinearVelocity().mVec128), DOT_PRODUCT(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128));
deltaImpulse = FMNADD(deltaVel1Dotn, tmp, deltaImpulse);
deltaImpulse = FMNADD(deltaVel2Dotn, tmp, deltaImpulse);
tmp = _mm_add_ps(c.m_appliedImpulse, deltaImpulse); // sum
const __m128 maskLower = _mm_cmpgt_ps(tmp, lowerLimit);
const __m128 maskUpper = _mm_cmpgt_ps(upperLimit, tmp);
deltaImpulse = _mm_blendv_ps(_mm_sub_ps(lowerLimit, c.m_appliedImpulse), _mm_blendv_ps(_mm_sub_ps(upperLimit, c.m_appliedImpulse), deltaImpulse, maskUpper), maskLower);
c.m_appliedImpulse = _mm_blendv_ps(lowerLimit, _mm_blendv_ps(upperLimit, tmp, maskUpper), maskLower);
body1.internalGetDeltaLinearVelocity().mVec128 = FMADD(_mm_mul_ps(c.m_contactNormal1.mVec128, body1.internalGetInvMass().mVec128), deltaImpulse, body1.internalGetDeltaLinearVelocity().mVec128);
body1.internalGetDeltaAngularVelocity().mVec128 = FMADD(c.m_angularComponentA.mVec128, deltaImpulse, body1.internalGetDeltaAngularVelocity().mVec128);
body2.internalGetDeltaLinearVelocity().mVec128 = FMADD(_mm_mul_ps(c.m_contactNormal2.mVec128, body2.internalGetInvMass().mVec128), deltaImpulse, body2.internalGetDeltaLinearVelocity().mVec128);
body2.internalGetDeltaAngularVelocity().mVec128 = FMADD(c.m_angularComponentB.mVec128, deltaImpulse, body2.internalGetDeltaAngularVelocity().mVec128);
return deltaImpulse;
}
static btSimdScalar gResolveSingleConstraintRowLowerLimit_sse2(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c)
{
__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);
__m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
__m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit);
btSimdScalar deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse), _mm_set1_ps(c.m_cfm)));
__m128 deltaVel1Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal1.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128));
__m128 deltaVel2Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal2.mVec128, body2.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128));
deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel1Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel2Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
btSimdScalar sum = _mm_add_ps(cpAppliedImp, deltaImpulse);
btSimdScalar resultLowerLess, resultUpperLess;
resultLowerLess = _mm_cmplt_ps(sum, lowerLimit1);
resultUpperLess = _mm_cmplt_ps(sum, upperLimit1);
__m128 lowMinApplied = _mm_sub_ps(lowerLimit1, cpAppliedImp);
deltaImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse));
c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum));
__m128 linearComponentA = _mm_mul_ps(c.m_contactNormal1.mVec128, body1.internalGetInvMass().mVec128);
__m128 linearComponentB = _mm_mul_ps(c.m_contactNormal2.mVec128, body2.internalGetInvMass().mVec128);
__m128 impulseMagnitude = deltaImpulse;
body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentA, impulseMagnitude));
body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentA.mVec128, impulseMagnitude));
body2.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentB, impulseMagnitude));
body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentB.mVec128, impulseMagnitude));
return deltaImpulse;
}
// Enhanced version of gResolveSingleConstraintRowGeneric_sse2 with SSE4.1 and FMA3
static btSimdScalar gResolveSingleConstraintRowLowerLimit_sse4_1_fma3(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c)
{
__m128 tmp = _mm_set_ps1(c.m_jacDiagABInv);
__m128 deltaImpulse = _mm_set_ps1(c.m_rhs - btScalar(c.m_appliedImpulse)*c.m_cfm);
const __m128 lowerLimit = _mm_set_ps1(c.m_lowerLimit);
const __m128 deltaVel1Dotn = _mm_add_ps(DOT_PRODUCT(c.m_contactNormal1.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), DOT_PRODUCT(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128));
const __m128 deltaVel2Dotn = _mm_add_ps(DOT_PRODUCT(c.m_contactNormal2.mVec128, body2.internalGetDeltaLinearVelocity().mVec128), DOT_PRODUCT(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128));
deltaImpulse = FMNADD(deltaVel1Dotn, tmp, deltaImpulse);
deltaImpulse = FMNADD(deltaVel2Dotn, tmp, deltaImpulse);
tmp = _mm_add_ps(c.m_appliedImpulse, deltaImpulse);
const __m128 mask = _mm_cmpgt_ps(tmp, lowerLimit);
deltaImpulse = _mm_blendv_ps(_mm_sub_ps(lowerLimit, c.m_appliedImpulse), deltaImpulse, mask);
c.m_appliedImpulse = _mm_blendv_ps(lowerLimit, tmp, mask);
body1.internalGetDeltaLinearVelocity().mVec128 = FMADD(_mm_mul_ps(c.m_contactNormal1.mVec128, body1.internalGetInvMass().mVec128), deltaImpulse, body1.internalGetDeltaLinearVelocity().mVec128);
body1.internalGetDeltaAngularVelocity().mVec128 = FMADD(c.m_angularComponentA.mVec128, deltaImpulse, body1.internalGetDeltaAngularVelocity().mVec128);
body2.internalGetDeltaLinearVelocity().mVec128 = FMADD(_mm_mul_ps(c.m_contactNormal2.mVec128, body2.internalGetInvMass().mVec128), deltaImpulse, body2.internalGetDeltaLinearVelocity().mVec128);
body2.internalGetDeltaAngularVelocity().mVec128 = FMADD(c.m_angularComponentB.mVec128, deltaImpulse, body2.internalGetDeltaAngularVelocity().mVec128);
return deltaImpulse;
}
#endif //USE_SIMD
btSimdScalar btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGenericSIMD(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c)
{
#ifdef USE_SIMD
return m_resolveSingleConstraintRowGeneric(body1, body2, c);
#else #else
return resolveSingleConstraintRowGeneric(body1,body2,c); return resolveSingleConstraintRowGeneric(body1,body2,c);
#endif #endif
@@ -129,29 +236,7 @@ btSimdScalar btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGene
btSimdScalar btSequentialImpulseConstraintSolver::resolveSingleConstraintRowLowerLimitSIMD(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c) btSimdScalar btSequentialImpulseConstraintSolver::resolveSingleConstraintRowLowerLimitSIMD(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c)
{ {
#ifdef USE_SIMD #ifdef USE_SIMD
__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse); return m_resolveSingleConstraintRowLowerLimit(body1, body2, c);
__m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
__m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit);
btSimdScalar deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse),_mm_set1_ps(c.m_cfm)));
__m128 deltaVel1Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal1.mVec128,body1.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128,body1.internalGetDeltaAngularVelocity().mVec128));
__m128 deltaVel2Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal2.mVec128,body2.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos2CrossNormal.mVec128,body2.internalGetDeltaAngularVelocity().mVec128));
deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv)));
btSimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse);
btSimdScalar resultLowerLess,resultUpperLess;
resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1);
resultUpperLess = _mm_cmplt_ps(sum,upperLimit1);
__m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp);
deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) );
c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) );
__m128 linearComponentA = _mm_mul_ps(c.m_contactNormal1.mVec128,body1.internalGetInvMass().mVec128);
__m128 linearComponentB = _mm_mul_ps(c.m_contactNormal2.mVec128,body2.internalGetInvMass().mVec128);
__m128 impulseMagnitude = deltaImpulse;
body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude));
body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude));
body2.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude));
body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude));
return deltaImpulse;
#else #else
return resolveSingleConstraintRowLowerLimit(body1,body2,c); return resolveSingleConstraintRowLowerLimit(body1,body2,c);
#endif #endif
@@ -248,6 +333,36 @@ void btSequentialImpulseConstraintSolver::resolveSplitPenetrationImpulseCacheFri
} }
btSequentialImpulseConstraintSolver::btSequentialImpulseConstraintSolver()
:m_btSeed2(0),
m_resolveSingleConstraintRowGeneric(gResolveSingleConstraintRowGeneric_sse2),
m_resolveSingleConstraintRowLowerLimit(gResolveSingleConstraintRowLowerLimit_sse2)
{
}
btSequentialImpulseConstraintSolver::~btSequentialImpulseConstraintSolver()
{
}
btSingleConstraintRowSolver btSequentialImpulseConstraintSolver::getSSE2ConstraintRowSolverGeneric()
{
return gResolveSingleConstraintRowGeneric_sse2;
}
btSingleConstraintRowSolver btSequentialImpulseConstraintSolver::getSSE4_1ConstraintRowSolverGeneric()
{
return gResolveSingleConstraintRowGeneric_sse4_1_fma3;
}
btSingleConstraintRowSolver btSequentialImpulseConstraintSolver::getSSE2ConstraintRowSolverLowerLimit()
{
return gResolveSingleConstraintRowLowerLimit_sse2;
}
btSingleConstraintRowSolver btSequentialImpulseConstraintSolver::getSSE4_1ConstraintRowSolverLowerLimit()
{
return gResolveSingleConstraintRowLowerLimit_sse4_1_fma3;
}
unsigned long btSequentialImpulseConstraintSolver::btRand2() unsigned long btSequentialImpulseConstraintSolver::btRand2()
{ {
@@ -434,8 +549,8 @@ void btSequentialImpulseConstraintSolver::setupFrictionConstraint(btSolverConstr
// btScalar positionalError = 0.f; // btScalar positionalError = 0.f;
btScalar velocityError = desiredVelocity - rel_vel; btSimdScalar velocityError = desiredVelocity - rel_vel;
btScalar velocityImpulse = velocityError * solverConstraint.m_jacDiagABInv; btSimdScalar velocityImpulse = velocityError * btSimdScalar(solverConstraint.m_jacDiagABInv);
solverConstraint.m_rhs = velocityImpulse; solverConstraint.m_rhs = velocityImpulse;
solverConstraint.m_rhsPenetration = 0.f; solverConstraint.m_rhsPenetration = 0.f;
solverConstraint.m_cfm = cfmSlip; solverConstraint.m_cfm = cfmSlip;
@@ -1741,5 +1856,3 @@ void btSequentialImpulseConstraintSolver::reset()
{ {
m_btSeed2 = 0; m_btSeed2 = 0;
} }

View File

@@ -27,6 +27,8 @@ class btCollisionObject;
#include "BulletCollision/NarrowPhaseCollision/btManifoldPoint.h" #include "BulletCollision/NarrowPhaseCollision/btManifoldPoint.h"
#include "BulletDynamics/ConstraintSolver/btConstraintSolver.h" #include "BulletDynamics/ConstraintSolver/btConstraintSolver.h"
typedef btSimdScalar(*btSingleConstraintRowSolver)(btSolverBody&, btSolverBody&, const btSolverConstraint&);
///The btSequentialImpulseConstraintSolver is a fast SIMD implementation of the Projected Gauss Seidel (iterative LCP) method. ///The btSequentialImpulseConstraintSolver is a fast SIMD implementation of the Projected Gauss Seidel (iterative LCP) method.
ATTRIBUTE_ALIGNED16(class) btSequentialImpulseConstraintSolver : public btConstraintSolver ATTRIBUTE_ALIGNED16(class) btSequentialImpulseConstraintSolver : public btConstraintSolver
{ {
@@ -43,6 +45,10 @@ protected:
btAlignedObjectArray<btTypedConstraint::btConstraintInfo1> m_tmpConstraintSizesPool; btAlignedObjectArray<btTypedConstraint::btConstraintInfo1> m_tmpConstraintSizesPool;
int m_maxOverrideNumSolverIterations; int m_maxOverrideNumSolverIterations;
int m_fixedBodyId; int m_fixedBodyId;
btSingleConstraintRowSolver m_resolveSingleConstraintRowGeneric;
btSingleConstraintRowSolver m_resolveSingleConstraintRowLowerLimit;
void setupFrictionConstraint( btSolverConstraint& solverConstraint, const btVector3& normalAxis,int solverBodyIdA,int solverBodyIdB, void setupFrictionConstraint( btSolverConstraint& solverConstraint, const btVector3& normalAxis,int solverBodyIdA,int solverBodyIdB,
btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2, btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2,
btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation,
@@ -112,9 +118,7 @@ public:
virtual ~btSequentialImpulseConstraintSolver(); virtual ~btSequentialImpulseConstraintSolver();
virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer,btDispatcher* dispatcher); virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer,btDispatcher* dispatcher);
///clear internal cached data and reset random seed ///clear internal cached data and reset random seed
virtual void reset(); virtual void reset();
@@ -136,6 +140,34 @@ public:
{ {
return BT_SEQUENTIAL_IMPULSE_SOLVER; return BT_SEQUENTIAL_IMPULSE_SOLVER;
} }
btSingleConstraintRowSolver getActiveConstraintRowSolverGeneric()
{
return m_resolveSingleConstraintRowGeneric;
}
void setConstraintRowSolverGeneric(btSingleConstraintRowSolver rowSolver)
{
m_resolveSingleConstraintRowGeneric = rowSolver;
}
btSingleConstraintRowSolver getActiveConstraintRowSolverLowerLimit()
{
return m_resolveSingleConstraintRowLowerLimit;
}
void setConstraintRowSolverLowerLimit(btSingleConstraintRowSolver rowSolver)
{
m_resolveSingleConstraintRowLowerLimit = rowSolver;
}
// btSingleConstraintRowSolver getScalarConstraintRowSolverLowerLimit();
btSingleConstraintRowSolver getSSE2ConstraintRowSolverGeneric();
btSingleConstraintRowSolver getSSE4_1ConstraintRowSolverGeneric();
btSingleConstraintRowSolver getSSE2ConstraintRowSolverLowerLimit();
btSingleConstraintRowSolver getSSE4_1ConstraintRowSolverLowerLimit();
}; };

View File

@@ -0,0 +1,87 @@
#ifndef BT_CPU_UTILITY_H
#define BT_CPU_UTILITY_H
#include "LinearMath/btScalar.h"
#include <string.h>//memset
#ifdef BT_USE_SSE
#if (_MSC_FULL_VER >= 160040219)
#include <intrin.h>
#endif
#endif
#if defined BT_USE_NEON
#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>
#include <sys/types.h>
#include <sys/sysctl.h> //for sysctlbyname
#endif //BT_USE_NEON
///Rudimentary btCpuFeatureUtility for CPU features: only report the features that Bullet actually uses (SSE4/FMA3, NEON_HPFP)
///We assume SSE2 in case BT_USE_SSE2 is defined in LinearMath/btScalar.h
class btCpuFeatureUtility
{
public:
enum btCpuFeature
{
CPU_FEATURE_FMA3=1,
CPU_FEATURE_SSE4_1=2,
CPU_FEATURE_NEON_HPFP=4
};
static int getCpuFeatures(btCpuFeature inFeature)
{
static int capabilities = 0;
static bool testedCapabilities = false;
if (0 != testedCapabilities)
{
return capabilities;
}
#ifdef BT_USE_NEON
{
uint32_t hasFeature = 0;
size_t featureSize = sizeof(hasFeature);
int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0);
if (0 == err && hasFeature)
capabilities |= CPU_FEATURE_NEON_HPFP;
}
#endif //BT_USE_NEON
#ifdef BT_USE_SSE
#if (_MSC_FULL_VER >= 160040219)
{
int cpuInfo[4];
memset(cpuInfo, 0, sizeof(cpuInfo));
unsigned long long sseExt;
__cpuid(mCpuInfo, 1);
mExt = _xgetbv(0);
const int OSXSAVEFlag = (1UL << 27);
const int AVXFlag = ((1UL << 28) | OSXSAVEFlag);
const int FMAFlag = ((1UL << 12) | AVXFlag | OSXSAVEFlag);
if ((mCpuInfo[2] & FMAFlag) == FMAFlag && (mExt & 6) == 6)
{
capabilities |= btCpuFeatureUtility::CPU_FEATURE_FMA3;
}
const int SSE41Flag = (1 << 19);
if (mCpuInfo[2] & SSE41Flag)
{
capabilities |= btCpuFeatureUtility::CPU_FEATURE_SSE4_1;
}
}
#endif//(_MSC_FULL_VER >= 160040219)
#endif//BT_USE_SSE
testedCapabilities = true;
return capabilities;
}
};
#endif //BT_CPU_UTILITY_H