parallel solver: various changes

- threading: adding btSequentialImpulseConstraintSolverMt - task scheduler: added parallelSum so that parallel solver can compute residuals - CommonRigidBodyMTBase: add slider for solver least squares residual and allow multithreading without needing OpenMP, TBB, or PPL - taskScheduler: don't wait for workers to sleep/signal at the end of each parallel block - parallel solver: convertContacts split into an allocContactConstraints and setupContactConstraints stage, the latter of which is done in parallel - parallel solver: rolling friction is now interleaved along with normal friction - parallel solver: batchified split impulse solving + some cleanup - parallel solver: sorting batches from largest to smallest - parallel solver: added parallel batch creation - parallel solver: added warmstartingWriteBackContacts func + other cleanup - task scheduler: truncate low bits to preserve determinism with parallelSum - parallel solver: reducing dynamic mem allocs and trying to parallelize more of the batch setup - parallel solver: parallelize updating constraint batch ids for merging - parallel solver: adding debug visualization - task scheduler: make TBB task scheduler parallelSum deterministic - parallel solver: split batch gen code into separate file; allow selection of batch gen method - task scheduler: add sleepWorkerThreadsHint() at end of simulation - parallel solver: added grain size per phase - task Scheduler: fix for strange threading issue; also no need for main thread to wait for workers to sleep - base constraint solver: break out joint setup into separate function for profiling/overriding - parallel solver: allow different batching method for contacts vs joints - base constraint solver: add convertJoint and convertBodies to make it possible to parallelize joint and body conversion - parallel solver: convert joints and bodies in parallel now - parallel solver: speed up batch creation with run-length encoding - parallel solver: batch gen: run-length expansion in 
parallel; collect constraint info in parallel - parallel solver: adding spatial grid batching method - parallel solver: enhancements to spatial grid batching - sequential solver: moving code for writing back into functions that derived classes can call - parallel solver: do write back of bodies and joints in parallel - parallel solver: removed all batching methods except for spatial grid (others were ineffective) - parallel solver: added 2D or 3D grid batching options; and a bit of cleanup - move btDefaultTaskScheduler into LinearMath project
2017-06-04 17:57:25 -07:00
parent 94bc897067
commit b8720f2161
25 changed files with 5236 additions and 767 deletions
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
@@ -325,3 +325,14 @@ void btDiscreteDynamicsWorldMt::integrateTransforms( btScalar timeStep )
    }
 }

+/// Runs the base-class stepSimulation, then hints the task scheduler (if one is set) to sleep its worker threads; returns the base-class result.
+int	btDiscreteDynamicsWorldMt::stepSimulation( btScalar timeStep, int maxSubSteps, btScalar fixedTimeStep )
+{
+    int numSubSteps = btDiscreteDynamicsWorld::stepSimulation(timeStep, maxSubSteps, fixedTimeStep);  // the stepping itself is delegated to the base class
+    if (btITaskScheduler* scheduler = btGetTaskScheduler())  // skipped when btGetTaskScheduler() returns null
+    {
+        // the step is finished: tell Bullet's threads to sleep, so other threads can run
+        scheduler->sleepWorkerThreadsHint();
+    }
+    return numSubSteps;  // value from the base-class call, passed through unchanged
+}
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
@@ -129,6 +129,8 @@ public:
        btCollisionConfiguration* collisionConfiguration
    );
 	virtual ~btDiscreteDynamicsWorldMt();
+    // Override of stepSimulation; no default arguments are declared here, so calls made through this type pass all three explicitly.
+    virtual int	stepSimulation( btScalar timeStep, int maxSubSteps, btScalar fixedTimeStep ) BT_OVERRIDE;
 };

 #endif //BT_DISCRETE_DYNAMICS_WORLD_H
--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
@@ -22,6 +22,7 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletCollision/CollisionDispatch/btCollisionWorld.h"
 #include "BulletDynamics/ConstraintSolver/btTypedConstraint.h"
+#include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h"  // for s_minimumContactManifoldsForBatching

 //#include <stdio.h>
 #include "LinearMath/btQuickprof.h"
@@ -589,14 +590,52 @@ struct UpdateIslandDispatcher : public btIParallelForBody
    }
 };

+/// Feeds the islands to an UpdateIslandDispatcher: the leading run of islands at or above the batching threshold through a direct forLoop call, the remainder through btParallelFor.
 void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback )
 {
    BT_PROFILE( "parallelIslandDispatch" );
-    int grainSize = 1;  // iterations per task
+    //
+    // if there are islands with many contacts, it may be faster to submit these
+    // large islands *serially* to a single parallel constraint solver, and then later
+    // submit the remaining smaller islands in parallel to multiple sequential solvers.
+    //
+    // Some task schedulers do not deal well with nested parallelFor loops. One implementation
+    // of OpenMP was actually slower than doing everything single-threaded. Intel TBB
+    // on the other hand, seems to do a pretty respectable job with it.
+    //
+    // When solving islands in parallel, the worst case performance happens when there
+    // is one very large island and then perhaps a smattering of very small
+    // islands -- one worker thread takes the large island and the remaining workers
+    // tear through the smaller islands and then sit idle waiting for the first worker
+    // to finish. Solving islands in parallel works best when there are numerous small
+    // islands, roughly equal in size.
+    //
+    // By contrast, the other approach -- the parallel constraint solver -- is only
+    // able to deliver a worthwhile speedup when the island is large. For smaller islands,
+    // it is difficult to extract a useful amount of parallelism -- the overhead of grouping
+    // the constraints into batches and sending the batches to worker threads can nullify
+    // any gains from parallelism.
+    //
+
    UpdateIslandDispatcher dispatcher;
    dispatcher.islandsPtr = islandsPtr;
    dispatcher.callback = callback;
-    btParallelFor( 0, islandsPtr->size(), grainSize, dispatcher );
+    // We take advantage of the fact the islands are sorted in order of decreasing size, so the large islands (if any) come first
+    int iBegin = 0;  // after the loop: index of the first island below the threshold, or the island count if none is
+    while (iBegin < islandsPtr->size())
+    {
+        btSimulationIslandManagerMt::Island* island = (*islandsPtr)[ iBegin ];
+        if (island->manifoldArray.size() < btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching)
+        {
+            // first island below the threshold: OK to submit it and the rest of the array in parallel
+            break;
+        }
+        ++iBegin;
+    }
+    // serial dispatch for large islands (if any), i.e. the ones counted by the loop above
+    dispatcher.forLoop(0, iBegin);
+    // parallel dispatch for rest, starting at iBegin; the third argument is the grain size (1 iteration per task, as before)
+    btParallelFor( iBegin, islandsPtr->size(), 1, dispatcher );
 }


--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
@@ -106,5 +106,7 @@ public:
    }
 };

+extern int gLargeIslandManifoldCount;  // NOTE(review): not referenced in the visible hunks (dispatch reads s_minimumContactManifoldsForBatching); confirm it is defined and still needed
+
 #endif //BT_SIMULATION_ISLAND_MANAGER_H