parallel solver: various changes
- threading: adding btSequentialImpulseConstraintSolverMt
- task scheduler: added parallelSum so that parallel solver can compute residuals (see the sketch after this list)
- CommonRigidBodyMTBase: add slider for solver least squares residual and allow multithreading without needing OpenMP, TBB, or PPL
- task scheduler: don't wait for workers to sleep/signal at the end of each parallel block
- parallel solver: convertContacts split into an allocContactConstraints and setupContactConstraints stage, the latter of which is done in parallel
- parallel solver: rolling friction is now interleaved along with normal friction
- parallel solver: batchified split impulse solving + some cleanup
- parallel solver: sorting batches from largest to smallest
- parallel solver: added parallel batch creation
- parallel solver: added warmstartingWriteBackContacts func + other cleanup
- task scheduler: truncate low bits to preserve determinism with parallelSum (sketched after the first diff hunk below)
- parallel solver: reducing dynamic mem allocs and trying to parallelize more of the batch setup
- parallel solver: parallelize updating constraint batch ids for merging
- parallel solver: adding debug visualization
- task scheduler: make TBB task scheduler parallelSum deterministic
- parallel solver: split batch gen code into separate file; allow selection of batch gen method
- task scheduler: add sleepWorkerThreadsHint() at end of simulation
- parallel solver: added grain size per phase
- task scheduler: fix for strange threading issue; also no need for main thread to wait for workers to sleep
- base constraint solver: break out joint setup into separate function for profiling/overriding
- parallel solver: allow different batching method for contacts vs joints
- base constraint solver: add convertJoint and convertBodies to make it possible to parallelize joint and body conversion
- parallel solver: convert joints and bodies in parallel now
- parallel solver: speed up batch creation with run-length encoding
- parallel solver: batch gen: run-length expansion in parallel; collect constraint info in parallel
- parallel solver: adding spatial grid batching method
- parallel solver: enhancements to spatial grid batching
- sequential solver: moving code for writing back into functions that derived classes can call
- parallel solver: do write back of bodies and joints in parallel
- parallel solver: removed all batching methods except for spatial grid (others were ineffective)
- parallel solver: added 2D or 3D grid batching options; and a bit of cleanup
- move btDefaultTaskScheduler into LinearMath project
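For context, here is a minimal sketch of how a solver might use the new parallelSum to compute a least-squares residual. The btIParallelSumBody interface and btParallelSum come from this commit; the ResidualSumBody struct and the deltaImpulses array are hypothetical caller-side code, not part of the patch:

    #include "LinearMath/btThreads.h"

    // Hypothetical example: sum squared impulse deltas across all constraint
    // rows so the solver can report a least-squares residual per iteration.
    struct ResidualSumBody : public btIParallelSumBody
    {
        const btScalar* mDeltaImpulses;  // hypothetical per-row impulse deltas

        ResidualSumBody( const btScalar* deltas ) : mDeltaImpulses( deltas ) {}

        // sumLoop is called once per grain; the scheduler combines partial sums.
        btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE
        {
            btScalar sum = btScalar( 0 );
            for ( int i = iBegin; i < iEnd; ++i )
            {
                sum += mDeltaImpulses[ i ] * mDeltaImpulses[ i ];
            }
            return sum;
        }
    };

    // usage: btScalar residual = btParallelSum( 0, numRows, grainSize, ResidualSumBody( deltas ) );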
@@ -453,6 +453,33 @@ void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body )
 #endif // #if BT_THREADSAFE
 }
 
+btScalar btParallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body )
+{
+#if BT_THREADSAFE
+
+#if BT_DETECT_BAD_THREAD_INDEX
+    if ( !btThreadsAreRunning() )
+    {
+        // clear out thread ids
+        for ( int i = 0; i < BT_MAX_THREAD_COUNT; ++i )
+        {
+            gDebugThreadIds[ i ] = kInvalidThreadId;
+        }
+    }
+#endif // #if BT_DETECT_BAD_THREAD_INDEX
+
+    btAssert( gBtTaskScheduler != NULL );  // call btSetTaskScheduler() with a valid task scheduler first!
+    return gBtTaskScheduler->parallelSum( iBegin, iEnd, grainSize, body );
+
+#else // #if BT_THREADSAFE
+
+    // non-parallel version of btParallelSum
+    btAssert( !"called btParallelSum in non-threadsafe build. enable BT_THREADSAFE" );
+    return body.sumLoop( iBegin, iEnd );
+
+#endif // #if BT_THREADSAFE
+}
+
 
 ///
 /// btTaskSchedulerSequential -- non-threaded implementation of task scheduler
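The commit log's "truncate low bits to preserve determinism" item refers to making a summed residual insensitive to the order in which partial sums are combined. A minimal sketch of that idea, assuming the helper below (the function name and bit count are hypothetical, not taken from the patch): zeroing the low mantissa bits of each partial result absorbs the tiny order-dependent rounding differences, though it is a heuristic, since values straddling a truncation boundary can still differ.

    #include <cstring>

    // Hypothetical helper: clear the low mantissa bits of a float so that
    // sums differing only by last-bit rounding collapse to the same value.
    inline float btTruncateLowBits( float x, int bitsToClear )
    {
        unsigned int u;
        std::memcpy( &u, &x, sizeof( u ) );    // type-pun via memcpy (no UB)
        u &= ~( ( 1u << bitsToClear ) - 1u );  // zero the lowest mantissa bits
        std::memcpy( &x, &u, sizeof( x ) );
        return x;
    }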
@@ -470,6 +497,11 @@ public:
         BT_PROFILE( "parallelFor_sequential" );
         body.forLoop( iBegin, iEnd );
     }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_sequential" );
+        return body.sumLoop( iBegin, iEnd );
+    }
 };
 
 
@@ -514,11 +546,25 @@ public:
 #pragma omp parallel for schedule( static, 1 )
         for ( int i = iBegin; i < iEnd; i += grainSize )
         {
-            BT_PROFILE( "OpenMP_job" );
+            BT_PROFILE( "OpenMP_forJob" );
             body.forLoop( i, ( std::min )( i + grainSize, iEnd ) );
         }
         btPopThreadsAreRunning();
     }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_OpenMP" );
+        btPushThreadsAreRunning();
+        btScalar sum = btScalar( 0 );
+#pragma omp parallel for schedule( static, 1 ) reduction(+:sum)
+        for ( int i = iBegin; i < iEnd; i += grainSize )
+        {
+            BT_PROFILE( "OpenMP_sumJob" );
+            sum += body.sumLoop( i, ( std::min )( i + grainSize, iEnd ) );
+        }
+        btPopThreadsAreRunning();
+        return sum;
+    }
 };
 #endif // #if BT_USE_OPENMP && BT_THREADSAFE
 
@@ -571,22 +617,21 @@ public:
             btResetThreadIndexCounter();
         }
     }
-    struct BodyAdapter
+    struct ForBodyAdapter
     {
         const btIParallelForBody* mBody;
 
+        ForBodyAdapter( const btIParallelForBody* body ) : mBody( body ) {}
         void operator()( const tbb::blocked_range<int>& range ) const
         {
-            BT_PROFILE( "TBB_job" );
+            BT_PROFILE( "TBB_forJob" );
             mBody->forLoop( range.begin(), range.end() );
         }
     };
     virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
     {
         BT_PROFILE( "parallelFor_TBB" );
-        // TBB dispatch
-        BodyAdapter tbbBody;
-        tbbBody.mBody = &body;
+        ForBodyAdapter tbbBody( &body );
         btPushThreadsAreRunning();
         tbb::parallel_for( tbb::blocked_range<int>( iBegin, iEnd, grainSize ),
                            tbbBody,
@@ -594,6 +639,29 @@ public:
                            );
         btPopThreadsAreRunning();
     }
+    struct SumBodyAdapter
+    {
+        const btIParallelSumBody* mBody;
+        btScalar mSum;
+
+        SumBodyAdapter( const btIParallelSumBody* body ) : mBody( body ), mSum( btScalar( 0 ) ) {}
+        SumBodyAdapter( const SumBodyAdapter& src, tbb::split ) : mBody( src.mBody ), mSum( btScalar( 0 ) ) {}
+        void join( const SumBodyAdapter& src ) { mSum += src.mSum; }
+        void operator()( const tbb::blocked_range<int>& range )
+        {
+            BT_PROFILE( "TBB_sumJob" );
+            mSum += mBody->sumLoop( range.begin(), range.end() );
+        }
+    };
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_TBB" );
+        SumBodyAdapter tbbBody( &body );
+        btPushThreadsAreRunning();
+        tbb::parallel_deterministic_reduce( tbb::blocked_range<int>( iBegin, iEnd, grainSize ), tbbBody );
+        btPopThreadsAreRunning();
+        return tbbBody.mSum;
+    }
 };
 #endif // #if BT_USE_TBB && BT_THREADSAFE
 
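tbb::parallel_deterministic_reduce is what backs the "make TBB task scheduler parallelSum deterministic" item: unlike tbb::parallel_reduce, it splits the range and joins partial sums in the same fixed order for a given range and grain size, so floating-point results are reproducible run to run regardless of thread count. A self-contained sketch using TBB's functional overload (the data vector is a placeholder, not from the patch):

    #include <tbb/parallel_reduce.h>
    #include <tbb/blocked_range.h>
    #include <functional>
    #include <vector>

    float deterministicSum( const std::vector<float>& data, int grainSize )
    {
        // The split/join tree is fixed by the range and grain size, so the
        // float additions happen in the same order on every run.
        return tbb::parallel_deterministic_reduce(
            tbb::blocked_range<int>( 0, (int) data.size(), grainSize ),
            0.0f,
            [&data]( const tbb::blocked_range<int>& r, float acc )
            {
                for ( int i = r.begin(); i != r.end(); ++i )
                    acc += data[ i ];
                return acc;
            },
            std::plus<float>() );
    }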
@@ -605,6 +673,7 @@ public:
 class btTaskSchedulerPPL : public btITaskScheduler
 {
     int m_numThreads;
+    concurrency::combinable<btScalar> m_sum;  // for parallelSum
 public:
     btTaskSchedulerPPL() : btITaskScheduler( "PPL" )
     {
@@ -644,15 +713,16 @@ public:
             btResetThreadIndexCounter();
         }
     }
-    struct BodyAdapter
+    struct ForBodyAdapter
    {
         const btIParallelForBody* mBody;
         int mGrainSize;
         int mIndexEnd;
 
+        ForBodyAdapter( const btIParallelForBody* body, int grainSize, int end ) : mBody( body ), mGrainSize( grainSize ), mIndexEnd( end ) {}
         void operator()( int i ) const
         {
-            BT_PROFILE( "PPL_job" );
+            BT_PROFILE( "PPL_forJob" );
             mBody->forLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );
         }
     };
@@ -660,10 +730,7 @@ public:
     {
         BT_PROFILE( "parallelFor_PPL" );
-        // PPL dispatch
-        BodyAdapter pplBody;
-        pplBody.mBody = &body;
-        pplBody.mGrainSize = grainSize;
-        pplBody.mIndexEnd = iEnd;
+        ForBodyAdapter pplBody( &body, grainSize, iEnd );
         btPushThreadsAreRunning();
         // note: MSVC 2010 doesn't support partitioner args, so avoid them
         concurrency::parallel_for( iBegin,
@@ -673,6 +740,36 @@ public:
                                    );
         btPopThreadsAreRunning();
     }
+    struct SumBodyAdapter
+    {
+        const btIParallelSumBody* mBody;
+        concurrency::combinable<btScalar>* mSum;
+        int mGrainSize;
+        int mIndexEnd;
+
+        SumBodyAdapter( const btIParallelSumBody* body, concurrency::combinable<btScalar>* sum, int grainSize, int end ) : mBody( body ), mSum( sum ), mGrainSize( grainSize ), mIndexEnd( end ) {}
+        void operator()( int i ) const
+        {
+            BT_PROFILE( "PPL_sumJob" );
+            mSum->local() += mBody->sumLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );
+        }
+    };
+    static btScalar sumFunc( btScalar a, btScalar b ) { return a + b; }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_PPL" );
+        m_sum.clear();
+        SumBodyAdapter pplBody( &body, &m_sum, grainSize, iEnd );
+        btPushThreadsAreRunning();
+        // note: MSVC 2010 doesn't support partitioner args, so avoid them
+        concurrency::parallel_for( iBegin,
+                                   iEnd,
+                                   grainSize,
+                                   pplBody
+                                   );
+        btPopThreadsAreRunning();
+        return m_sum.combine( sumFunc );
+    }
 };
 #endif // #if BT_USE_PPL && BT_THREADSAFE
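A note on the PPL path: concurrency::combinable keeps one partial sum per worker thread, and m_sum.combine() folds those thread-local values in an unspecified order, so unlike the TBB path this parallelSum is not bit-for-bit reproducible across runs. A minimal standalone sketch of the combinable pattern (Windows-only, outside Bullet; the function name is hypothetical):

    #include <ppl.h>

    float combinableSum( const float* data, int count )
    {
        concurrency::combinable<float> partial;  // one accumulator per thread
        concurrency::parallel_for( 0, count, [&]( int i )
        {
            partial.local() += data[ i ];        // lock-free thread-local add
        } );
        // fold the per-thread accumulators; combine order is unspecified
        return partial.combine( []( float a, float b ) { return a + b; } );
    }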