#include "b3GpuPgsJacobiSolver.h" #include "Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h" #include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h" #include #include "Bullet3Common/b3AlignedObjectArray.h" #include //for memset #include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h" #include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" #include "Bullet3OpenCL/RigidBody/kernels/jointSolver.h" //solveConstraintRowsCL #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #define B3_JOINT_SOLVER_PATH "src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl" struct b3GpuPgsJacobiSolverInternalData { cl_context m_context; cl_device_id m_device; cl_command_queue m_queue; cl_kernel m_solveJointConstraintRowsKernels; b3OpenCLArray* m_gpuSolverConstraintRows; b3OpenCLArray* m_gpuSolverBodies; b3OpenCLArray* m_gpuBatchConstraints; b3OpenCLArray* m_gpuConstraintRows; }; b3GpuPgsJacobiSolver::b3GpuPgsJacobiSolver (cl_context ctx, cl_device_id device, cl_command_queue queue,bool usePgs) :b3PgsJacobiSolver (usePgs) { m_gpuData = new b3GpuPgsJacobiSolverInternalData(); m_gpuData->m_context = ctx; m_gpuData->m_device = device; m_gpuData->m_queue = queue; m_gpuData->m_gpuSolverConstraintRows = new b3OpenCLArray(ctx,queue); m_gpuData->m_gpuSolverBodies = new b3OpenCLArray(m_gpuData->m_context,m_gpuData->m_queue); m_gpuData->m_gpuBatchConstraints = new b3OpenCLArray(m_gpuData->m_context,m_gpuData->m_queue); m_gpuData->m_gpuConstraintRows = new b3OpenCLArray(m_gpuData->m_context,m_gpuData->m_queue); cl_int errNum=0; { cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,&errNum,"",B3_JOINT_SOLVER_PATH); b3Assert(errNum==CL_SUCCESS); m_gpuData->m_solveJointConstraintRowsKernels = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device,solveConstraintRowsCL, "solveJointConstraintRows",&errNum,prog); b3Assert(errNum==CL_SUCCESS); clReleaseProgram(prog); } } b3GpuPgsJacobiSolver::~b3GpuPgsJacobiSolver () { clReleaseKernel(m_gpuData->m_solveJointConstraintRowsKernels); delete m_gpuData->m_gpuSolverConstraintRows; delete m_gpuData->m_gpuSolverBodies; delete m_gpuData->m_gpuBatchConstraints; delete m_gpuData->m_gpuConstraintRows; delete m_gpuData; } struct b3BatchConstraint { int m_bodyAPtrAndSignBit; int m_bodyBPtrAndSignBit; int m_constraintRowOffset; short int m_numConstraintRows; short int m_batchId; short int& getBatchIdx() { return m_batchId; } }; static b3AlignedObjectArray batchConstraints; static b3AlignedObjectArray batches; b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlySetup(b3RigidBodyCL* bodies, b3InertiaCL* inertias, int numBodies, b3Contact4* manifoldPtr, int numManifolds,b3TypedConstraint** constraints,int numConstraints,const b3ContactSolverInfo& infoGlobal) { B3_PROFILE("GPU solveGroupCacheFriendlySetup"); batchConstraints.resize(numConstraints); m_staticIdx = -1; m_maxOverrideNumSolverIterations = 0; m_tmpSolverBodyPool.resize(0); m_bodyCount.resize(0); m_bodyCount.resize(numBodies,0); m_bodyCountCheck.resize(0); m_bodyCountCheck.resize(numBodies,0); m_deltaLinearVelocities.resize(0); m_deltaLinearVelocities.resize(numBodies,b3Vector3(0,0,0)); m_deltaAngularVelocities.resize(0); m_deltaAngularVelocities.resize(numBodies,b3Vector3(0,0,0)); int totalBodies = 0; for (int i=0;igetRigidBodyA(); int bodyIndexB = constraints[i]->getRigidBodyB(); if (m_usePgs) { m_bodyCount[bodyIndexA]=-1; m_bodyCount[bodyIndexB]=-1; } else { //didn't implement joints with Jacobi version yet b3Assert(0); } } for (int i=0;iinternalSetAppliedImpulse(0.0f); } } //b3RigidBody* rb0=0,*rb1=0; //if (1) { { int totalNumRows = 0; int i; m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints); //calculate the total number of contraint rows for (i=0;igetJointFeedback(); if (fb) { fb->m_appliedForceBodyA.setZero(); fb->m_appliedTorqueBodyA.setZero(); fb->m_appliedForceBodyB.setZero(); fb->m_appliedTorqueBodyB.setZero(); } if (constraints[i]->isEnabled()) { } if (constraints[i]->isEnabled()) { constraints[i]->getInfo1(&info1,bodies); } else { info1.m_numConstraintRows = 0; info1.nub = 0; } batchConstraints[i].m_numConstraintRows = info1.m_numConstraintRows; batchConstraints[i].m_constraintRowOffset = totalNumRows; totalNumRows += info1.m_numConstraintRows; } m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows); #ifndef DISABLE_JOINTS ///setup the b3SolverConstraints int currentRow = 0; for (i=0;igetRigidBodyA()]; //b3RigidBody& rbA = constraint->getRigidBodyA(); // b3RigidBody& rbB = constraint->getRigidBodyB(); b3RigidBodyCL& rbB = bodies[ constraint->getRigidBodyB()]; int solverBodyIdA = getOrInitSolverBody(constraint->getRigidBodyA(),bodies,inertias); int solverBodyIdB = getOrInitSolverBody(constraint->getRigidBodyB(),bodies,inertias); b3SolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA]; b3SolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB]; if (rbA.getInvMass()) { batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA; } else { if (!solverBodyIdA) m_staticIdx = 0; batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA; } if (rbB.getInvMass()) { batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB; } else { if (!solverBodyIdB) m_staticIdx = 0; batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB; } int overrideNumSolverIterations = constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations; if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations) m_maxOverrideNumSolverIterations = overrideNumSolverIterations; int j; for ( j=0;jinternalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f); bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f); bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f); bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f); bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f); bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f); bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f); bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f); b3TypedConstraint::b3ConstraintInfo2 info2; info2.fps = 1.f/infoGlobal.m_timeStep; info2.erp = infoGlobal.m_erp; info2.m_J1linearAxis = currentConstraintRow->m_contactNormal; info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal; info2.m_J2linearAxis = 0; info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal; info2.rowskip = sizeof(b3SolverConstraint)/sizeof(b3Scalar);//check this ///the size of b3SolverConstraint needs be a multiple of b3Scalar b3Assert(info2.rowskip*sizeof(b3Scalar)== sizeof(b3SolverConstraint)); info2.m_constraintError = ¤tConstraintRow->m_rhs; currentConstraintRow->m_cfm = infoGlobal.m_globalCfm; info2.m_damping = infoGlobal.m_damping; info2.cfm = ¤tConstraintRow->m_cfm; info2.m_lowerLimit = ¤tConstraintRow->m_lowerLimit; info2.m_upperLimit = ¤tConstraintRow->m_upperLimit; info2.m_numIterations = infoGlobal.m_numIterations; constraints[i]->getInfo2(&info2,bodies); ///finalize the constraint setup for ( j=0;j=constraints[i]->getBreakingImpulseThreshold()) { solverConstraint.m_upperLimit = constraints[i]->getBreakingImpulseThreshold(); } if (solverConstraint.m_lowerLimit<=-constraints[i]->getBreakingImpulseThreshold()) { solverConstraint.m_lowerLimit = -constraints[i]->getBreakingImpulseThreshold(); } solverConstraint.m_originalContactPoint = constraint; b3Matrix3x3& invInertiaWorldA= inertias[constraint->getRigidBodyA()].m_invInertiaWorld; { //b3Vector3 angularFactorA(1,1,1); const b3Vector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal; solverConstraint.m_angularComponentA = invInertiaWorldA*ftorqueAxis1;//*angularFactorA; } b3Matrix3x3& invInertiaWorldB= inertias[constraint->getRigidBodyB()].m_invInertiaWorld; { const b3Vector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal; solverConstraint.m_angularComponentB = invInertiaWorldB*ftorqueAxis2;//*constraint->getRigidBodyB().getAngularFactor(); } { //it is ok to use solverConstraint.m_contactNormal instead of -solverConstraint.m_contactNormal //because it gets multiplied iMJlB b3Vector3 iMJlA = solverConstraint.m_contactNormal*rbA.getInvMass(); b3Vector3 iMJaA = invInertiaWorldA*solverConstraint.m_relpos1CrossNormal; b3Vector3 iMJlB = solverConstraint.m_contactNormal*rbB.getInvMass();//sign of normal? b3Vector3 iMJaB = invInertiaWorldB*solverConstraint.m_relpos2CrossNormal; b3Scalar sum = iMJlA.dot(solverConstraint.m_contactNormal); sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal); sum += iMJlB.dot(solverConstraint.m_contactNormal); sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal); b3Scalar fsum = b3Fabs(sum); b3Assert(fsum > B3_EPSILON); solverConstraint.m_jacDiagABInv = fsum>B3_EPSILON?b3Scalar(1.)/sum : 0.f; } ///fix rhs ///todo: add force/torque accelerators { b3Scalar rel_vel; b3Scalar vel1Dotn = solverConstraint.m_contactNormal.dot(rbA.m_linVel) + solverConstraint.m_relpos1CrossNormal.dot(rbA.m_angVel); b3Scalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rbB.m_linVel) + solverConstraint.m_relpos2CrossNormal.dot(rbB.m_angVel); rel_vel = vel1Dotn+vel2Dotn; b3Scalar restitution = 0.f; b3Scalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2 b3Scalar velocityError = restitution - rel_vel * info2.m_damping; b3Scalar penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv; b3Scalar velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv; solverConstraint.m_rhs = penetrationImpulse+velocityImpulse; solverConstraint.m_appliedImpulse = 0.f; } } } currentRow+=m_tmpConstraintSizesPool[i].m_numConstraintRows; } #endif //DISABLE_JOINTS } { int i; for (i=0;im_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor; body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor); } void resolveSingleConstraintRowGeneric2( b3SolverBody* body1, b3SolverBody* body2, b3SolverConstraint* c) { float deltaImpulse = c->m_rhs-c->m_appliedImpulse.x*c->m_cfm; float deltaVel1Dotn = b3Dot(c->m_contactNormal,body1->m_deltaLinearVelocity) + b3Dot(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity); float deltaVel2Dotn = -b3Dot(c->m_contactNormal,body2->m_deltaLinearVelocity) + b3Dot(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity); deltaImpulse -= deltaVel1Dotn*c->m_jacDiagABInv; deltaImpulse -= deltaVel2Dotn*c->m_jacDiagABInv; float sum = c->m_appliedImpulse.x + deltaImpulse; if (sum < c->m_lowerLimit) { deltaImpulse = c->m_lowerLimit-c->m_appliedImpulse.x; c->m_appliedImpulse.x = c->m_lowerLimit; } else if (sum > c->m_upperLimit) { deltaImpulse = c->m_upperLimit-c->m_appliedImpulse.x; c->m_appliedImpulse.x = c->m_upperLimit; } else { c->m_appliedImpulse.x = sum; } internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse); internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse); } b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3TypedConstraint** cpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal) { bool useb3PgsJacobiSolver = false; bool createBatches = batches.size()==0; if (useb3PgsJacobiSolver) { return b3PgsJacobiSolver::solveGroupCacheFriendlyIterations(cpuConstraints,numConstraints,infoGlobal); } else { B3_PROFILE("GpuSolveGroupCacheFriendlyIterations"); if (createBatches) { batches.resize(0); { B3_PROFILE("batch joints"); b3Assert(batchConstraints.size()==numConstraints); int simdWidth =numConstraints; int numBodies = m_tmpSolverBodyPool.size(); sortConstraintByBatch3( &batchConstraints[0], numConstraints, simdWidth , m_staticIdx, numBodies); } } int maxIterations = infoGlobal.m_numIterations; bool useBatching = true; bool useGpu=false; if (useBatching ) { if (useGpu) { B3_PROFILE("copy from host"); m_gpuData->m_gpuSolverConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool); m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool); m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints); m_gpuData->m_gpuConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool); } for ( int iteration = 0 ; iteration< maxIterations ; iteration++) { int batchOffset = 0; int constraintOffset=0; int numBatches = batches.size(); for (int bb=0;bbm_queue,m_gpuData->m_solveJointConstraintRowsKernels); launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL()); launcher.setConst(batchOffset); launcher.setConst(constraintOffset); launcher.setConst(numConstraintsInBatch); launcher.launch1D(numConstraintsInBatch); clFinish(m_gpuData->m_queue); } else { for (int b=0;bm_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); } //int sz = sizeof(b3SolverBody); //printf("cpu sizeof(b3SolverBody)=%d\n",sz); } else { for ( int iteration = 0 ; iteration< maxIterations ; iteration++) { int numJoints = m_tmpSolverNonContactConstraintPool.size(); for (int j=0;j bodyUsed; static b3AlignedObjectArray curUsed; inline int b3GpuPgsJacobiSolver::sortConstraintByBatch3( b3BatchConstraint* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies) { int sz = sizeof(b3BatchConstraint); B3_PROFILE("sortConstraintByBatch3"); static int maxSwaps = 0; int numSwaps = 0; curUsed.resize(2*simdWidth); static int maxNumConstraints = 0; if (maxNumConstraints