diff --git a/ObsoleteDemos/CMakeLists.txt b/ObsoleteDemos/CMakeLists.txt index 912c348a6..c6d13ac9c 100644 --- a/ObsoleteDemos/CMakeLists.txt +++ b/ObsoleteDemos/CMakeLists.txt @@ -10,7 +10,7 @@ IF (USE_GLUT) # ENDIF() SET(SharedDemoSubdirs OpenGL - CcdPhysicsDemo SliderConstraintDemo GenericJointDemo Raytracer + CcdPhysicsDemo ConstraintDemo SliderConstraintDemo GenericJointDemo Raytracer RagdollDemo ForkLiftDemo BasicDemo FeatherstoneMultiBodyDemo RollingFrictionDemo RaytestDemo VoronoiFractureDemo GyroscopicDemo FractureDemo Box2dDemo BspDemo MovingConcaveDemo VehicleDemo UserCollisionAlgorithm CharacterDemo SoftDemo diff --git a/ObsoleteDemos/ConstraintDemo/CMakeLists.txt b/ObsoleteDemos/ConstraintDemo/CMakeLists.txt index 70945677b..d43a3bfa4 100644 --- a/ObsoleteDemos/ConstraintDemo/CMakeLists.txt +++ b/ObsoleteDemos/ConstraintDemo/CMakeLists.txt @@ -14,14 +14,14 @@ INCLUDE_DIRECTORIES( ${BULLET_PHYSICS_SOURCE_DIR}/src ../OpenGL -${BULLET_PHYSICS_SOURCE_DIR}/Extras/Serialize/BulletFileLoader -${BULLET_PHYSICS_SOURCE_DIR}/Extras/Serialize/BulletWorldImporter +${GLUT_INCLUDE_DIR} ) IF (USE_GLUT) LINK_LIBRARIES( - OpenGLSupport BulletWorldImporter BulletDynamics BulletCollision LinearMath BulletFileLoader ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} + OpenGLSupport BulletDynamics BulletCollision LinearMath ${GLUT_glut_LIBRARY} + ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} ) ADD_EXECUTABLE(AppConstraintDemo @@ -31,7 +31,7 @@ IF (USE_GLUT) ) ELSE (USE_GLUT) LINK_LIBRARIES( - OpenGLSupport BulletWorldImporter BulletDynamics BulletCollision LinearMath BulletFileLoader ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} + OpenGLSupport BulletDynamics BulletCollision LinearMath ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} ) ADD_EXECUTABLE(AppConstraintDemo diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp index 7d749a0cb..3608c40df 100644 --- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp +++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp @@ -37,57 +37,164 @@ int gNumSplitImpulseRecoveries = 0; #include "BulletDynamics/Dynamics/btRigidBody.h" -btSequentialImpulseConstraintSolver::btSequentialImpulseConstraintSolver() -:m_btSeed2(0) -{ -} - -btSequentialImpulseConstraintSolver::~btSequentialImpulseConstraintSolver() -{ -} #ifdef USE_SIMD -#include +#include //is it safe to include this header? + #define btVecSplat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e)) static inline __m128 btSimdDot3( __m128 vec0, __m128 vec1 ) { __m128 result = _mm_mul_ps( vec0, vec1); return _mm_add_ps( btVecSplat( result, 0 ), _mm_add_ps( btVecSplat( result, 1 ), btVecSplat( result, 2 ) ) ); } -#endif//USE_SIMD + +#define USE_FMA 1 +#define USE_FMA3_INSTEAD_FMA4 1 +#define USE_SSE4_DOT 0 + +#define SSE4_DP(a, b) _mm_dp_ps(a, b, 0x7f) +#define SSE4_DP_FP(a, b) _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7f)) + +#if USE_SSE4_DOT +#define DOT_PRODUCT(a, b) SSE4_DP(a, b) +#else +#define DOT_PRODUCT(a, b) btSimdDot3(a, b) +#endif + +#if USE_FMA +#if USE_FMA3_INSTEAD_FMA4 +// a*b + c +#define FMADD(a, b, c) _mm_fmadd_ps(a, b, c) +// -(a*b) + c +#define FMNADD(a, b, c) _mm_fnmadd_ps(a, b, c) +#else // USE_FMA3 +// a*b + c +#define FMADD(a, b, c) _mm_macc_ps(a, b, c) +// -(a*b) + c +#define FMNADD(a, b, c) _mm_nmacc_ps(a, b, c) +#endif +#else // USE_FMA +// c + a*b +#define FMADD(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b)) +// c - a*b +#define FMNADD(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b)) +#endif // Project Gauss Seidel or the equivalent Sequential Impulse -btSimdScalar btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGenericSIMD(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c) +static btSimdScalar gResolveSingleConstraintRowGeneric_sse2(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c) { -#ifdef USE_SIMD __m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse); __m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit); __m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit); - btSimdScalar deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse),_mm_set1_ps(c.m_cfm))); - __m128 deltaVel1Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal1.mVec128,body1.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128,body1.internalGetDeltaAngularVelocity().mVec128)); - __m128 deltaVel2Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal2.mVec128,body2.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos2CrossNormal.mVec128,body2.internalGetDeltaAngularVelocity().mVec128)); - deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv))); - deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv))); - btSimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse); - btSimdScalar resultLowerLess,resultUpperLess; - resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1); - resultUpperLess = _mm_cmplt_ps(sum,upperLimit1); - __m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp); - deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) ); - c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) ); - __m128 upperMinApplied = _mm_sub_ps(upperLimit1,cpAppliedImp); - deltaImpulse = _mm_or_ps( _mm_and_ps(resultUpperLess, deltaImpulse), _mm_andnot_ps(resultUpperLess, upperMinApplied) ); - c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultUpperLess, c.m_appliedImpulse), _mm_andnot_ps(resultUpperLess, upperLimit1) ); - __m128 linearComponentA = _mm_mul_ps(c.m_contactNormal1.mVec128,body1.internalGetInvMass().mVec128); - __m128 linearComponentB = _mm_mul_ps((c.m_contactNormal2).mVec128,body2.internalGetInvMass().mVec128); + btSimdScalar deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse), _mm_set1_ps(c.m_cfm))); + __m128 deltaVel1Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal1.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128)); + __m128 deltaVel2Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal2.mVec128, body2.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128)); + deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel1Dotn, _mm_set1_ps(c.m_jacDiagABInv))); + deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel2Dotn, _mm_set1_ps(c.m_jacDiagABInv))); + btSimdScalar sum = _mm_add_ps(cpAppliedImp, deltaImpulse); + btSimdScalar resultLowerLess, resultUpperLess; + resultLowerLess = _mm_cmplt_ps(sum, lowerLimit1); + resultUpperLess = _mm_cmplt_ps(sum, upperLimit1); + __m128 lowMinApplied = _mm_sub_ps(lowerLimit1, cpAppliedImp); + deltaImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse)); + c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum)); + __m128 upperMinApplied = _mm_sub_ps(upperLimit1, cpAppliedImp); + deltaImpulse = _mm_or_ps(_mm_and_ps(resultUpperLess, deltaImpulse), _mm_andnot_ps(resultUpperLess, upperMinApplied)); + c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultUpperLess, c.m_appliedImpulse), _mm_andnot_ps(resultUpperLess, upperLimit1)); + __m128 linearComponentA = _mm_mul_ps(c.m_contactNormal1.mVec128, body1.internalGetInvMass().mVec128); + __m128 linearComponentB = _mm_mul_ps((c.m_contactNormal2).mVec128, body2.internalGetInvMass().mVec128); __m128 impulseMagnitude = deltaImpulse; - body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude)); - body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude)); - body2.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude)); - body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude)); - + body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentA, impulseMagnitude)); + body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentA.mVec128, impulseMagnitude)); + body2.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentB, impulseMagnitude)); + body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentB.mVec128, impulseMagnitude)); return deltaImpulse; +} + + +// Enhanced version of gResolveSingleConstraintRowGeneric_sse2 with SSE4.1 and FMA3 +static btSimdScalar gResolveSingleConstraintRowGeneric_sse4_1_fma3(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c) +{ + __m128 tmp = _mm_set_ps1(c.m_jacDiagABInv); + __m128 deltaImpulse = _mm_set_ps1(c.m_rhs - btScalar(c.m_appliedImpulse)*c.m_cfm); + const __m128 lowerLimit = _mm_set_ps1(c.m_lowerLimit); + const __m128 upperLimit = _mm_set_ps1(c.m_upperLimit); + const __m128 deltaVel1Dotn = _mm_add_ps(DOT_PRODUCT(c.m_contactNormal1.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), DOT_PRODUCT(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128)); + const __m128 deltaVel2Dotn = _mm_add_ps(DOT_PRODUCT(c.m_contactNormal2.mVec128, body2.internalGetDeltaLinearVelocity().mVec128), DOT_PRODUCT(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128)); + deltaImpulse = FMNADD(deltaVel1Dotn, tmp, deltaImpulse); + deltaImpulse = FMNADD(deltaVel2Dotn, tmp, deltaImpulse); + tmp = _mm_add_ps(c.m_appliedImpulse, deltaImpulse); // sum + const __m128 maskLower = _mm_cmpgt_ps(tmp, lowerLimit); + const __m128 maskUpper = _mm_cmpgt_ps(upperLimit, tmp); + deltaImpulse = _mm_blendv_ps(_mm_sub_ps(lowerLimit, c.m_appliedImpulse), _mm_blendv_ps(_mm_sub_ps(upperLimit, c.m_appliedImpulse), deltaImpulse, maskUpper), maskLower); + c.m_appliedImpulse = _mm_blendv_ps(lowerLimit, _mm_blendv_ps(upperLimit, tmp, maskUpper), maskLower); + body1.internalGetDeltaLinearVelocity().mVec128 = FMADD(_mm_mul_ps(c.m_contactNormal1.mVec128, body1.internalGetInvMass().mVec128), deltaImpulse, body1.internalGetDeltaLinearVelocity().mVec128); + body1.internalGetDeltaAngularVelocity().mVec128 = FMADD(c.m_angularComponentA.mVec128, deltaImpulse, body1.internalGetDeltaAngularVelocity().mVec128); + body2.internalGetDeltaLinearVelocity().mVec128 = FMADD(_mm_mul_ps(c.m_contactNormal2.mVec128, body2.internalGetInvMass().mVec128), deltaImpulse, body2.internalGetDeltaLinearVelocity().mVec128); + body2.internalGetDeltaAngularVelocity().mVec128 = FMADD(c.m_angularComponentB.mVec128, deltaImpulse, body2.internalGetDeltaAngularVelocity().mVec128); + return deltaImpulse; +} + + +static btSimdScalar gResolveSingleConstraintRowLowerLimit_sse2(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c) +{ + __m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse); + __m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit); + __m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit); + btSimdScalar deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse), _mm_set1_ps(c.m_cfm))); + __m128 deltaVel1Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal1.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128)); + __m128 deltaVel2Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal2.mVec128, body2.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128)); + deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel1Dotn, _mm_set1_ps(c.m_jacDiagABInv))); + deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel2Dotn, _mm_set1_ps(c.m_jacDiagABInv))); + btSimdScalar sum = _mm_add_ps(cpAppliedImp, deltaImpulse); + btSimdScalar resultLowerLess, resultUpperLess; + resultLowerLess = _mm_cmplt_ps(sum, lowerLimit1); + resultUpperLess = _mm_cmplt_ps(sum, upperLimit1); + __m128 lowMinApplied = _mm_sub_ps(lowerLimit1, cpAppliedImp); + deltaImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse)); + c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum)); + __m128 linearComponentA = _mm_mul_ps(c.m_contactNormal1.mVec128, body1.internalGetInvMass().mVec128); + __m128 linearComponentB = _mm_mul_ps(c.m_contactNormal2.mVec128, body2.internalGetInvMass().mVec128); + __m128 impulseMagnitude = deltaImpulse; + body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentA, impulseMagnitude)); + body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentA.mVec128, impulseMagnitude)); + body2.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentB, impulseMagnitude)); + body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentB.mVec128, impulseMagnitude)); + return deltaImpulse; +} + + +// Enhanced version of gResolveSingleConstraintRowGeneric_sse2 with SSE4.1 and FMA3 +static btSimdScalar gResolveSingleConstraintRowLowerLimit_sse4_1_fma3(btSolverBody& body1, btSolverBody& body2, const btSolverConstraint& c) +{ + __m128 tmp = _mm_set_ps1(c.m_jacDiagABInv); + __m128 deltaImpulse = _mm_set_ps1(c.m_rhs - btScalar(c.m_appliedImpulse)*c.m_cfm); + const __m128 lowerLimit = _mm_set_ps1(c.m_lowerLimit); + const __m128 deltaVel1Dotn = _mm_add_ps(DOT_PRODUCT(c.m_contactNormal1.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), DOT_PRODUCT(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128)); + const __m128 deltaVel2Dotn = _mm_add_ps(DOT_PRODUCT(c.m_contactNormal2.mVec128, body2.internalGetDeltaLinearVelocity().mVec128), DOT_PRODUCT(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128)); + deltaImpulse = FMNADD(deltaVel1Dotn, tmp, deltaImpulse); + deltaImpulse = FMNADD(deltaVel2Dotn, tmp, deltaImpulse); + tmp = _mm_add_ps(c.m_appliedImpulse, deltaImpulse); + const __m128 mask = _mm_cmpgt_ps(tmp, lowerLimit); + deltaImpulse = _mm_blendv_ps(_mm_sub_ps(lowerLimit, c.m_appliedImpulse), deltaImpulse, mask); + c.m_appliedImpulse = _mm_blendv_ps(lowerLimit, tmp, mask); + body1.internalGetDeltaLinearVelocity().mVec128 = FMADD(_mm_mul_ps(c.m_contactNormal1.mVec128, body1.internalGetInvMass().mVec128), deltaImpulse, body1.internalGetDeltaLinearVelocity().mVec128); + body1.internalGetDeltaAngularVelocity().mVec128 = FMADD(c.m_angularComponentA.mVec128, deltaImpulse, body1.internalGetDeltaAngularVelocity().mVec128); + body2.internalGetDeltaLinearVelocity().mVec128 = FMADD(_mm_mul_ps(c.m_contactNormal2.mVec128, body2.internalGetInvMass().mVec128), deltaImpulse, body2.internalGetDeltaLinearVelocity().mVec128); + body2.internalGetDeltaAngularVelocity().mVec128 = FMADD(c.m_angularComponentB.mVec128, deltaImpulse, body2.internalGetDeltaAngularVelocity().mVec128); + return deltaImpulse; +} + + +#endif //USE_SIMD + + + +btSimdScalar btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGenericSIMD(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c) +{ +#ifdef USE_SIMD + return m_resolveSingleConstraintRowGeneric(body1, body2, c); #else return resolveSingleConstraintRowGeneric(body1,body2,c); #endif @@ -129,29 +236,7 @@ btSimdScalar btSequentialImpulseConstraintSolver::resolveSingleConstraintRowGene btSimdScalar btSequentialImpulseConstraintSolver::resolveSingleConstraintRowLowerLimitSIMD(btSolverBody& body1,btSolverBody& body2,const btSolverConstraint& c) { #ifdef USE_SIMD - __m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse); - __m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit); - __m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit); - btSimdScalar deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse),_mm_set1_ps(c.m_cfm))); - __m128 deltaVel1Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal1.mVec128,body1.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos1CrossNormal.mVec128,body1.internalGetDeltaAngularVelocity().mVec128)); - __m128 deltaVel2Dotn = _mm_add_ps(btSimdDot3(c.m_contactNormal2.mVec128,body2.internalGetDeltaLinearVelocity().mVec128), btSimdDot3(c.m_relpos2CrossNormal.mVec128,body2.internalGetDeltaAngularVelocity().mVec128)); - deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel1Dotn,_mm_set1_ps(c.m_jacDiagABInv))); - deltaImpulse = _mm_sub_ps(deltaImpulse,_mm_mul_ps(deltaVel2Dotn,_mm_set1_ps(c.m_jacDiagABInv))); - btSimdScalar sum = _mm_add_ps(cpAppliedImp,deltaImpulse); - btSimdScalar resultLowerLess,resultUpperLess; - resultLowerLess = _mm_cmplt_ps(sum,lowerLimit1); - resultUpperLess = _mm_cmplt_ps(sum,upperLimit1); - __m128 lowMinApplied = _mm_sub_ps(lowerLimit1,cpAppliedImp); - deltaImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse) ); - c.m_appliedImpulse = _mm_or_ps( _mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum) ); - __m128 linearComponentA = _mm_mul_ps(c.m_contactNormal1.mVec128,body1.internalGetInvMass().mVec128); - __m128 linearComponentB = _mm_mul_ps(c.m_contactNormal2.mVec128,body2.internalGetInvMass().mVec128); - __m128 impulseMagnitude = deltaImpulse; - body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentA,impulseMagnitude)); - body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentA.mVec128,impulseMagnitude)); - body2.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaLinearVelocity().mVec128,_mm_mul_ps(linearComponentB,impulseMagnitude)); - body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128 ,_mm_mul_ps(c.m_angularComponentB.mVec128,impulseMagnitude)); - return deltaImpulse; + return m_resolveSingleConstraintRowLowerLimit(body1, body2, c); #else return resolveSingleConstraintRowLowerLimit(body1,body2,c); #endif @@ -248,6 +333,36 @@ void btSequentialImpulseConstraintSolver::resolveSplitPenetrationImpulseCacheFri } + btSequentialImpulseConstraintSolver::btSequentialImpulseConstraintSolver() + :m_btSeed2(0), + m_resolveSingleConstraintRowGeneric(gResolveSingleConstraintRowGeneric_sse2), + m_resolveSingleConstraintRowLowerLimit(gResolveSingleConstraintRowLowerLimit_sse2) + { + + } + + btSequentialImpulseConstraintSolver::~btSequentialImpulseConstraintSolver() + { + } + + + btSingleConstraintRowSolver btSequentialImpulseConstraintSolver::getSSE2ConstraintRowSolverGeneric() + { + return gResolveSingleConstraintRowGeneric_sse2; + } + btSingleConstraintRowSolver btSequentialImpulseConstraintSolver::getSSE4_1ConstraintRowSolverGeneric() + { + return gResolveSingleConstraintRowGeneric_sse4_1_fma3; + } + btSingleConstraintRowSolver btSequentialImpulseConstraintSolver::getSSE2ConstraintRowSolverLowerLimit() + { + return gResolveSingleConstraintRowLowerLimit_sse2; + } + btSingleConstraintRowSolver btSequentialImpulseConstraintSolver::getSSE4_1ConstraintRowSolverLowerLimit() + { + return gResolveSingleConstraintRowLowerLimit_sse4_1_fma3; + } + unsigned long btSequentialImpulseConstraintSolver::btRand2() { @@ -434,8 +549,8 @@ void btSequentialImpulseConstraintSolver::setupFrictionConstraint(btSolverConstr // btScalar positionalError = 0.f; - btScalar velocityError = desiredVelocity - rel_vel; - btScalar velocityImpulse = velocityError * solverConstraint.m_jacDiagABInv; + btSimdScalar velocityError = desiredVelocity - rel_vel; + btSimdScalar velocityImpulse = velocityError * btSimdScalar(solverConstraint.m_jacDiagABInv); solverConstraint.m_rhs = velocityImpulse; solverConstraint.m_rhsPenetration = 0.f; solverConstraint.m_cfm = cfmSlip; @@ -1741,5 +1856,3 @@ void btSequentialImpulseConstraintSolver::reset() { m_btSeed2 = 0; } - - diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h index 47dbbe339..001708a21 100644 --- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h +++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h @@ -27,6 +27,8 @@ class btCollisionObject; #include "BulletCollision/NarrowPhaseCollision/btManifoldPoint.h" #include "BulletDynamics/ConstraintSolver/btConstraintSolver.h" +typedef btSimdScalar(*btSingleConstraintRowSolver)(btSolverBody&, btSolverBody&, const btSolverConstraint&); + ///The btSequentialImpulseConstraintSolver is a fast SIMD implementation of the Projected Gauss Seidel (iterative LCP) method. ATTRIBUTE_ALIGNED16(class) btSequentialImpulseConstraintSolver : public btConstraintSolver { @@ -43,6 +45,10 @@ protected: btAlignedObjectArray m_tmpConstraintSizesPool; int m_maxOverrideNumSolverIterations; int m_fixedBodyId; + + btSingleConstraintRowSolver m_resolveSingleConstraintRowGeneric; + btSingleConstraintRowSolver m_resolveSingleConstraintRowLowerLimit; + void setupFrictionConstraint( btSolverConstraint& solverConstraint, const btVector3& normalAxis,int solverBodyIdA,int solverBodyIdB, btManifoldPoint& cp,const btVector3& rel_pos1,const btVector3& rel_pos2, btCollisionObject* colObj0,btCollisionObject* colObj1, btScalar relaxation, @@ -112,9 +118,7 @@ public: virtual ~btSequentialImpulseConstraintSolver(); virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer,btDispatcher* dispatcher); - - - + ///clear internal cached data and reset random seed virtual void reset(); @@ -136,6 +140,34 @@ public: { return BT_SEQUENTIAL_IMPULSE_SOLVER; } + + btSingleConstraintRowSolver getActiveConstraintRowSolverGeneric() + { + return m_resolveSingleConstraintRowGeneric; + } + void setConstraintRowSolverGeneric(btSingleConstraintRowSolver rowSolver) + { + m_resolveSingleConstraintRowGeneric = rowSolver; + } + btSingleConstraintRowSolver getActiveConstraintRowSolverLowerLimit() + { + return m_resolveSingleConstraintRowLowerLimit; + } + void setConstraintRowSolverLowerLimit(btSingleConstraintRowSolver rowSolver) + { + m_resolveSingleConstraintRowLowerLimit = rowSolver; + } + + + // btSingleConstraintRowSolver getScalarConstraintRowSolverLowerLimit(); + btSingleConstraintRowSolver getSSE2ConstraintRowSolverGeneric(); + btSingleConstraintRowSolver getSSE4_1ConstraintRowSolverGeneric(); + btSingleConstraintRowSolver getSSE2ConstraintRowSolverLowerLimit(); + btSingleConstraintRowSolver getSSE4_1ConstraintRowSolverLowerLimit(); + + + + }; diff --git a/src/LinearMath/btCpuFeatureUtility.h b/src/LinearMath/btCpuFeatureUtility.h new file mode 100644 index 000000000..0c9ba99a9 --- /dev/null +++ b/src/LinearMath/btCpuFeatureUtility.h @@ -0,0 +1,87 @@ + +#ifndef BT_CPU_UTILITY_H +#define BT_CPU_UTILITY_H + +#include "LinearMath/btScalar.h" + +#include //memset +#ifdef BT_USE_SSE +#if (_MSC_FULL_VER >= 160040219) +#include +#endif +#endif + +#if defined BT_USE_NEON +#define ARM_NEON_GCC_COMPATIBILITY 1 +#include +#include +#include //for sysctlbyname +#endif //BT_USE_NEON + +///Rudimentary btCpuFeatureUtility for CPU features: only report the features that Bullet actually uses (SSE4/FMA3, NEON_HPFP) +///We assume SSE2 in case BT_USE_SSE2 is defined in LinearMath/btScalar.h +class btCpuFeatureUtility +{ +public: + enum btCpuFeature + { + CPU_FEATURE_FMA3=1, + CPU_FEATURE_SSE4_1=2, + CPU_FEATURE_NEON_HPFP=4 + }; + + static int getCpuFeatures(btCpuFeature inFeature) + { + + static int capabilities = 0; + static bool testedCapabilities = false; + if (0 != testedCapabilities) + { + return capabilities; + } + +#ifdef BT_USE_NEON + { + uint32_t hasFeature = 0; + size_t featureSize = sizeof(hasFeature); + int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0); + if (0 == err && hasFeature) + capabilities |= CPU_FEATURE_NEON_HPFP; + } +#endif //BT_USE_NEON + +#ifdef BT_USE_SSE +#if (_MSC_FULL_VER >= 160040219) + { + int cpuInfo[4]; + memset(cpuInfo, 0, sizeof(cpuInfo)); + unsigned long long sseExt; + __cpuid(mCpuInfo, 1); + mExt = _xgetbv(0); + + const int OSXSAVEFlag = (1UL << 27); + const int AVXFlag = ((1UL << 28) | OSXSAVEFlag); + const int FMAFlag = ((1UL << 12) | AVXFlag | OSXSAVEFlag); + if ((mCpuInfo[2] & FMAFlag) == FMAFlag && (mExt & 6) == 6) + { + capabilities |= btCpuFeatureUtility::CPU_FEATURE_FMA3; + } + + const int SSE41Flag = (1 << 19); + if (mCpuInfo[2] & SSE41Flag) + { + capabilities |= btCpuFeatureUtility::CPU_FEATURE_SSE4_1; + } + } +#endif//(_MSC_FULL_VER >= 160040219) +#endif//BT_USE_SSE + + testedCapabilities = true; + return capabilities; + } + + +}; + + +#endif //BT_CPU_UTILITY_H