parallel solver: various changes

- threading: adding btSequentialImpulseConstraintSolverMt - task scheduler: added parallelSum so that parallel solver can compute residuals - CommonRigidBodyMTBase: add slider for solver least squares residual and allow multithreading without needing OpenMP, TBB, or PPL - taskScheduler: don't wait for workers to sleep/signal at the end of each parallel block - parallel solver: convertContacts split into an allocContactConstraints and setupContactConstraints stage, the latter of which is done in parallel - parallel solver: rolling friction is now interleaved along with normal friction - parallel solver: batchified split impulse solving + some cleanup - parallel solver: sorting batches from largest to smallest - parallel solver: added parallel batch creation - parallel solver: added warmstartingWriteBackContacts func + other cleanup - task scheduler: truncate low bits to preserve determinism with parallelSum - parallel solver: reducing dynamic mem allocs and trying to parallelize more of the batch setup - parallel solver: parallelize updating constraint batch ids for merging - parallel solver: adding debug visualization - task scheduler: make TBB task scheduler parallelSum deterministic - parallel solver: split batch gen code into separate file; allow selection of batch gen method - task scheduler: add sleepWorkerThreadsHint() at end of simulation - parallel solver: added grain size per phase - task Scheduler: fix for strange threading issue; also no need for main thread to wait for workers to sleep - base constraint solver: break out joint setup into separate function for profiling/overriding - parallel solver: allow different batching method for contacts vs joints - base constraint solver: add convertJoint and convertBodies to make it possible to parallelize joint and body conversion - parallel solver: convert joints and bodies in parallel now - parallel solver: speed up batch creation with run-length encoding - parallel solver: batch gen: run-length expansion in parallel; collect constraint info in parallel - parallel solver: adding spatial grid batching method - parallel solver: enhancements to spatial grid batching - sequential solver: moving code for writing back into functions that derived classes can call - parallel solver: do write back of bodies and joints in parallel - parallel solver: removed all batching methods except for spatial grid (others were ineffective) - parallel solver: added 2D or 3D grid batching options; and a bit of cleanup - move btDefaultTaskScheduler into LinearMath project
2017-06-04 17:57:25 -07:00
parent 94bc897067
commit b8720f2161
25 changed files with 5236 additions and 767 deletions
--- a/src/LinearMath/btThreads.cpp
+++ b/src/LinearMath/btThreads.cpp
@@ -453,6 +453,33 @@ void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBod
 #endif// #if BT_THREADSAFE
 }

+btScalar btParallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body )
+{
+#if BT_THREADSAFE
+
+#if BT_DETECT_BAD_THREAD_INDEX
+    if ( !btThreadsAreRunning() )
+    {
+        // clear out thread ids
+        for ( int i = 0; i < BT_MAX_THREAD_COUNT; ++i )
+        {
+            gDebugThreadIds[ i ] = kInvalidThreadId;
+        }
+    }
+#endif // #if BT_DETECT_BAD_THREAD_INDEX
+
+    btAssert( gBtTaskScheduler != NULL );  // call btSetTaskScheduler() with a valid task scheduler first!
+    return gBtTaskScheduler->parallelSum( iBegin, iEnd, grainSize, body );
+
+#else // #if BT_THREADSAFE
+
+    // non-parallel version of btParallelSum
+    btAssert( !"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE" );
+    return body.sumLoop( iBegin, iEnd );
+
+#endif //#else // #if BT_THREADSAFE
+}
+

 ///
 /// btTaskSchedulerSequential -- non-threaded implementation of task scheduler
@@ -470,6 +497,11 @@ public:
        BT_PROFILE( "parallelFor_sequential" );
        body.forLoop( iBegin, iEnd );
    }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_sequential" );
+        return body.sumLoop( iBegin, iEnd );
+    }
 };


@@ -514,11 +546,25 @@ public:
 #pragma omp parallel for schedule( static, 1 )
        for ( int i = iBegin; i < iEnd; i += grainSize )
        {
-            BT_PROFILE( "OpenMP_job" );
+            BT_PROFILE( "OpenMP_forJob" );
            body.forLoop( i, ( std::min )( i + grainSize, iEnd ) );
        }
        btPopThreadsAreRunning();
    }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelFor_OpenMP" );
+        btPushThreadsAreRunning();
+        btScalar sum = btScalar( 0 );
+#pragma omp parallel for schedule( static, 1 ) reduction(+:sum)
+        for ( int i = iBegin; i < iEnd; i += grainSize )
+        {
+            BT_PROFILE( "OpenMP_sumJob" );
+            sum += body.sumLoop( i, ( std::min )( i + grainSize, iEnd ) );
+        }
+        btPopThreadsAreRunning();
+        return sum;
+    }
 };
 #endif // #if BT_USE_OPENMP && BT_THREADSAFE

@@ -571,22 +617,21 @@ public:
            btResetThreadIndexCounter();
        }
    }
-    struct BodyAdapter
+    struct ForBodyAdapter
    {
        const btIParallelForBody* mBody;

+        ForBodyAdapter( const btIParallelForBody* body ) : mBody( body ) {}
        void operator()( const tbb::blocked_range<int>& range ) const
        {
-            BT_PROFILE( "TBB_job" );
+            BT_PROFILE( "TBB_forJob" );
            mBody->forLoop( range.begin(), range.end() );
        }
    };
    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
    {
        BT_PROFILE( "parallelFor_TBB" );
-        // TBB dispatch
-        BodyAdapter tbbBody;
-        tbbBody.mBody = &body;
+        ForBodyAdapter tbbBody( &body );
        btPushThreadsAreRunning();
        tbb::parallel_for( tbb::blocked_range<int>( iBegin, iEnd, grainSize ),
            tbbBody,
@@ -594,6 +639,29 @@ public:
        );
        btPopThreadsAreRunning();
    }
+    struct SumBodyAdapter
+    {
+        const btIParallelSumBody* mBody;
+        btScalar mSum;
+
+        SumBodyAdapter( const btIParallelSumBody* body ) : mBody( body ), mSum( btScalar( 0 ) ) {}
+        SumBodyAdapter( const SumBodyAdapter& src, tbb::split ) : mBody( src.mBody ), mSum( btScalar( 0 ) ) {}
+        void join( const SumBodyAdapter& src ) { mSum += src.mSum; }
+        void operator()( const tbb::blocked_range<int>& range )
+        {
+            BT_PROFILE( "TBB_sumJob" );
+            mSum += mBody->sumLoop( range.begin(), range.end() );
+        }
+    };
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_TBB" );
+        SumBodyAdapter tbbBody( &body );
+        btPushThreadsAreRunning();
+        tbb::parallel_deterministic_reduce( tbb::blocked_range<int>( iBegin, iEnd, grainSize ), tbbBody );
+        btPopThreadsAreRunning();
+        return tbbBody.mSum;
+    }
 };
 #endif // #if BT_USE_TBB && BT_THREADSAFE

@@ -605,6 +673,7 @@ public:
 class btTaskSchedulerPPL : public btITaskScheduler
 {
    int m_numThreads;
+    concurrency::combinable<btScalar> m_sum;  // for parallelSum
 public:
    btTaskSchedulerPPL() : btITaskScheduler( "PPL" )
    {
@@ -644,15 +713,16 @@ public:
            btResetThreadIndexCounter();
        }
    }
-    struct BodyAdapter
+    struct ForBodyAdapter
    {
        const btIParallelForBody* mBody;
        int mGrainSize;
        int mIndexEnd;

+        ForBodyAdapter( const btIParallelForBody* body, int grainSize, int end ) : mBody( body ), mGrainSize( grainSize ), mIndexEnd( end ) {}
        void operator()( int i ) const
        {
-            BT_PROFILE( "PPL_job" );
+            BT_PROFILE( "PPL_forJob" );
            mBody->forLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );
        }
    };
@@ -660,10 +730,7 @@ public:
    {
        BT_PROFILE( "parallelFor_PPL" );
        // PPL dispatch
-        BodyAdapter pplBody;
-        pplBody.mBody = &body;
-        pplBody.mGrainSize = grainSize;
-        pplBody.mIndexEnd = iEnd;
+        ForBodyAdapter pplBody( &body, grainSize, iEnd );
        btPushThreadsAreRunning();
        // note: MSVC 2010 doesn't support partitioner args, so avoid them
        concurrency::parallel_for( iBegin,
@@ -673,6 +740,36 @@ public:
        );
        btPopThreadsAreRunning();
    }
+    struct SumBodyAdapter
+    {
+        const btIParallelSumBody* mBody;
+        concurrency::combinable<btScalar>* mSum;
+        int mGrainSize;
+        int mIndexEnd;
+
+        SumBodyAdapter( const btIParallelSumBody* body, concurrency::combinable<btScalar>* sum, int grainSize, int end ) : mBody( body ), mSum(sum), mGrainSize( grainSize ), mIndexEnd( end ) {}
+        void operator()( int i ) const
+        {
+            BT_PROFILE( "PPL_sumJob" );
+            mSum->local() += mBody->sumLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );
+        }
+    };
+    static btScalar sumFunc( btScalar a, btScalar b ) { return a + b; }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_PPL" );
+        m_sum.clear();
+        SumBodyAdapter pplBody( &body, &m_sum, grainSize, iEnd );
+        btPushThreadsAreRunning();
+        // note: MSVC 2010 doesn't support partitioner args, so avoid them
+        concurrency::parallel_for( iBegin,
+            iEnd,
+            grainSize,
+            pplBody
+        );
+        btPopThreadsAreRunning();
+        return m_sum.combine( sumFunc );
+    }
 };
 #endif // #if BT_USE_PPL && BT_THREADSAFE