diff --git a/src/Bullet3Dynamics/ConstraintSolver/b3SolverBody.h b/src/Bullet3Dynamics/ConstraintSolver/b3SolverBody.h index b25c6c085..8ec5ec7aa 100644 --- a/src/Bullet3Dynamics/ConstraintSolver/b3SolverBody.h +++ b/src/Bullet3Dynamics/ConstraintSolver/b3SolverBody.h @@ -51,6 +51,7 @@ struct b3SimdScalar { __m128 m_vec128; float m_floats[4]; + float x,y,z,w; int m_ints[4]; b3Scalar m_unusedPadding; }; @@ -105,7 +106,7 @@ operator+(const b3SimdScalar& v1, const b3SimdScalar& v2) #endif ///The b3SolverBody is an internal datastructure for the constraint solver. Only necessary data is packed to increase cache coherence/performance. -B3_ATTRIBUTE_ALIGNED64 (struct) b3SolverBody +B3_ATTRIBUTE_ALIGNED16 (struct) b3SolverBody { B3_DECLARE_ALIGNED_ALLOCATOR(); b3Transform m_worldTransform; @@ -125,6 +126,8 @@ B3_ATTRIBUTE_ALIGNED64 (struct) b3SolverBody int m_originalBodyIndex; }; + int padding[3]; + void setWorldTransform(const b3Transform& worldTransform) { diff --git a/src/Bullet3OpenCL/RigidBody/b3GpuPgsJacobiSolver.cpp b/src/Bullet3OpenCL/RigidBody/b3GpuPgsJacobiSolver.cpp index f964a61ae..76a759f70 100644 --- a/src/Bullet3OpenCL/RigidBody/b3GpuPgsJacobiSolver.cpp +++ b/src/Bullet3OpenCL/RigidBody/b3GpuPgsJacobiSolver.cpp @@ -25,6 +25,10 @@ struct b3GpuPgsJacobiSolverInternalData cl_device_id m_device; cl_command_queue m_queue; cl_kernel m_solveJointConstraintRowsKernels; + b3OpenCLArray* m_gpuSolverConstraintRows; + b3OpenCLArray* m_gpuSolverBodies; + b3OpenCLArray* m_gpuBatchConstraints; + b3OpenCLArray* m_gpuConstraintRows; }; @@ -36,6 +40,12 @@ b3GpuPgsJacobiSolver::b3GpuPgsJacobiSolver (cl_context ctx, cl_device_id device, m_gpuData->m_device = device; m_gpuData->m_queue = queue; + m_gpuData->m_gpuSolverConstraintRows = new b3OpenCLArray(ctx,queue); + m_gpuData->m_gpuSolverBodies = new b3OpenCLArray(m_gpuData->m_context,m_gpuData->m_queue); + m_gpuData->m_gpuBatchConstraints = new b3OpenCLArray(m_gpuData->m_context,m_gpuData->m_queue); + m_gpuData->m_gpuConstraintRows = new b3OpenCLArray(m_gpuData->m_context,m_gpuData->m_queue); + + cl_int errNum=0; { @@ -53,6 +63,11 @@ b3GpuPgsJacobiSolver::~b3GpuPgsJacobiSolver () { clReleaseKernel(m_gpuData->m_solveJointConstraintRowsKernels); + delete m_gpuData->m_gpuSolverConstraintRows; + delete m_gpuData->m_gpuSolverBodies; + delete m_gpuData->m_gpuBatchConstraints; + delete m_gpuData->m_gpuConstraintRows; + delete m_gpuData; } @@ -419,16 +434,51 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlySetup(b3RigidBodyCL* bodie +///a straight copy from GPU/OpenCL kernel, for debugging +__inline void internalApplyImpulse( b3SolverBody* body, const b3Vector3& linearComponent, const b3Vector3& angularComponent,float impulseMagnitude) +{ + body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor; + body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor); +} +void resolveSingleConstraintRowGeneric2( b3SolverBody* body1, b3SolverBody* body2, b3SolverConstraint* c) +{ + float deltaImpulse = c->m_rhs-c->m_appliedImpulse.x*c->m_cfm; + float deltaVel1Dotn = b3Dot(c->m_contactNormal,body1->m_deltaLinearVelocity) + b3Dot(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity); + float deltaVel2Dotn = -b3Dot(c->m_contactNormal,body2->m_deltaLinearVelocity) + b3Dot(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity); + + deltaImpulse -= deltaVel1Dotn*c->m_jacDiagABInv; + deltaImpulse -= deltaVel2Dotn*c->m_jacDiagABInv; + + float sum = c->m_appliedImpulse.x + deltaImpulse; + if (sum < c->m_lowerLimit) + { + deltaImpulse = c->m_lowerLimit-c->m_appliedImpulse.x; + c->m_appliedImpulse.x = c->m_lowerLimit; + } + else if (sum > c->m_upperLimit) + { + deltaImpulse = c->m_upperLimit-c->m_appliedImpulse.x; + c->m_appliedImpulse.x = c->m_upperLimit; + } + else + { + c->m_appliedImpulse.x = sum; + } + + internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse); + internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse); + +} b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3TypedConstraint** cpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal) { - bool useCpu = false; + bool useb3PgsJacobiSolver = false; bool createBatches = batches.size()==0; - if (useCpu) + if (useb3PgsJacobiSolver) { return b3PgsJacobiSolver::solveGroupCacheFriendlyIterations(cpuConstraints,numConstraints,infoGlobal); } else @@ -449,20 +499,19 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3TypedConstrai } int maxIterations = infoGlobal.m_numIterations; bool useBatching = true; + bool useGpu=false; + if (useBatching ) { - b3OpenCLArray gpuSolverConstraintRows(m_gpuData->m_context,m_gpuData->m_queue); - gpuSolverConstraintRows.copyFromHost(m_tmpSolverNonContactConstraintPool); - - b3OpenCLArray gpuSolverBodies(m_gpuData->m_context,m_gpuData->m_queue); - gpuSolverBodies.copyFromHost(m_tmpSolverBodyPool); -// gpuSolverBodies.copyToHost(m_tmpSolverBodyPool); + if (useGpu) + { + B3_PROFILE("copy from host"); + m_gpuData->m_gpuSolverConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool); + m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool); + m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints); + m_gpuData->m_gpuConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool); + } - b3OpenCLArray gpuBatchConstraints(m_gpuData->m_context,m_gpuData->m_queue); - gpuBatchConstraints.copyFromHost(batchConstraints); - - b3OpenCLArray gpuConstraintRows(m_gpuData->m_context,m_gpuData->m_queue); - gpuConstraintRows.copyFromHost(m_tmpSolverNonContactConstraintPool); for ( int iteration = 0 ; iteration< maxIterations ; iteration++) @@ -475,18 +524,20 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3TypedConstrai { int numConstraintsInBatch = batches[bb]; - bool useGpu=false; + if (useGpu) { + B3_PROFILE("b3LauncherCL"); b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_solveJointConstraintRowsKernels); - launcher.setBuffer(gpuSolverBodies.getBufferCL()); - launcher.setBuffer(gpuBatchConstraints.getBufferCL()); - launcher.setBuffer(gpuConstraintRows.getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL()); launcher.setConst(batchOffset); launcher.setConst(constraintOffset); launcher.setConst(numConstraintsInBatch); launcher.launch1D(numConstraintsInBatch); + clFinish(m_gpuData->m_queue); } else { @@ -503,8 +554,8 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3TypedConstrai { // b3SolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[c.m_constraintRowOffset+jj]; -// resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint); - resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint); + resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint); + //resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint); } } @@ -514,9 +565,16 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3TypedConstrai } }//for (int iteration... - gpuSolverBodies.copyToHost(m_tmpSolverBodyPool); - clFinish(m_gpuData->m_queue); - printf(",,\n"); + if (useGpu) + { + B3_PROFILE("copy to host"); + m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); + } + //int sz = sizeof(b3SolverBody); + //printf("cpu sizeof(b3SolverBody)=%d\n",sz); + + + } else diff --git a/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp b/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp index 0fa1f26ec..38eb0ab3c 100644 --- a/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp +++ b/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp @@ -237,8 +237,8 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime) m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumRigidBodies(),&hostBodies[0],&hostInertias[0],0,0,numJoints, joints); } - //gpuBodies.copyFromHost(hostBodies); - printf("...\n"); + gpuBodies.copyFromHost(hostBodies); + } if (numContacts) diff --git a/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl b/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl index 6a197b477..eb3f6040b 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl +++ b/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl @@ -1,4 +1,5 @@ + typedef float4 Quaternion; typedef struct @@ -38,6 +39,8 @@ typedef struct void* m_originalBody; int m_originalBodyIndex; }; + int padding[3]; + } b3SolverBody; @@ -131,11 +134,8 @@ void resolveSingleConstraintRowGeneric(__global b3SolverBody* body1, __global b3 c->m_appliedImpulse.x = sum; } - if (body1->m_invMass.x) - internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse); - - if (body2->m_invMass.x) - internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse); + internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse); + internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse); } diff --git a/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h b/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h index 1a25292f8..d43d2979b 100644 --- a/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h +++ b/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h @@ -1,6 +1,7 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project static const char* solveConstraintRowsCL= \ "\n" +"\n" "typedef float4 Quaternion;\n" "\n" "typedef struct\n" @@ -40,6 +41,8 @@ static const char* solveConstraintRowsCL= \ " void* m_originalBody;\n" " int m_originalBodyIndex;\n" " };\n" +" int padding[3];\n" +"\n" "} b3SolverBody;\n" "\n" "\n" @@ -133,11 +136,8 @@ static const char* solveConstraintRowsCL= \ " c->m_appliedImpulse.x = sum;\n" " }\n" "\n" -" if (body1->m_invMass.x)\n" -" internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);\n" -" \n" -" if (body2->m_invMass.x)\n" -" internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse);\n" +" internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);\n" +" internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse);\n" "\n" "}\n" "\n"