parallel solver: various changes

- threading: adding btSequentialImpulseConstraintSolverMt
 - task scheduler: added parallelSum so that parallel solver can compute residuals
 - CommonRigidBodyMTBase: add slider for solver least squares residual and allow multithreading without needing OpenMP, TBB, or PPL
 - taskScheduler: don't wait for workers to sleep/signal at the end of each parallel block
 - parallel solver: convertContacts split into an allocContactConstraints and setupContactConstraints stage, the latter of which is done in parallel
 - parallel solver: rolling friction is now interleaved along with normal friction
 - parallel solver: batchified split impulse solving + some cleanup
 - parallel solver: sorting batches from largest to smallest
 - parallel solver: added parallel batch creation
 - parallel solver: added warmstartingWriteBackContacts func + other cleanup
 - task scheduler: truncate low bits to preserve determinism with parallelSum
 - parallel solver: reducing dynamic mem allocs and trying to parallelize more of the batch setup
 - parallel solver: parallelize updating constraint batch ids for merging
 - parallel solver: adding debug visualization
 - task scheduler: make TBB task scheduler parallelSum deterministic
 - parallel solver: split batch gen code into separate file; allow selection of batch gen method
 - task scheduler: add sleepWorkerThreadsHint() at end of simulation
 - parallel solver: added grain size per phase
 - task Scheduler: fix for strange threading issue; also no need for main thread to wait for workers to sleep
 - base constraint solver: break out joint setup into separate function for profiling/overriding
 - parallel solver: allow different batching method for contacts vs joints
 - base constraint solver: add convertJoint and convertBodies to make it possible to parallelize joint and body conversion
 - parallel solver: convert joints and bodies in parallel now
 - parallel solver: speed up batch creation with run-length encoding
 - parallel solver: batch gen: run-length expansion in parallel; collect constraint info in parallel
 - parallel solver: adding spatial grid batching method
 - parallel solver: enhancements to spatial grid batching
 - sequential solver: moving code for writing back into functions that derived classes can call
 - parallel solver: do write back of bodies and joints in parallel
 - parallel solver: removed all batching methods except for spatial grid (others were ineffective)
 - parallel solver: added 2D or 3D grid batching options; and a bit of cleanup
 - move btDefaultTaskScheduler into LinearMath project
This commit is contained in:
Lunkhound
2017-06-04 17:57:25 -07:00
parent 94bc897067
commit b8720f2161
25 changed files with 5236 additions and 767 deletions

View File

@@ -1,448 +0,0 @@
#include "LinearMath/btTransform.h"
#include "../Utils/b3Clock.h"
#include "LinearMath/btAlignedObjectArray.h"
#include "LinearMath/btThreads.h"
#include "LinearMath/btQuickprof.h"
#include <stdio.h>
#include <algorithm>
typedef void( *btThreadFunc )( void* userPtr, void* lsMemory );
typedef void* ( *btThreadLocalStorageFunc )();
#if BT_THREADSAFE
#if defined( _WIN32 )
#include "b3Win32ThreadSupport.h"
b3ThreadSupportInterface* createThreadSupport( int numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName )
{
b3Win32ThreadSupport::Win32ThreadConstructionInfo constructionInfo( uniqueName, threadFunc, localStoreFunc, numThreads );
//constructionInfo.m_priority = 0; // highest priority (the default) -- can cause erratic performance when numThreads > numCores
// we don't want worker threads to be higher priority than the main thread or the main thread could get
// totally shut out and unable to tell the workers to stop
constructionInfo.m_priority = -1; // normal priority
b3Win32ThreadSupport* threadSupport = new b3Win32ThreadSupport( constructionInfo );
return threadSupport;
}
#else // #if defined( _WIN32 )
#include "b3PosixThreadSupport.h"
b3ThreadSupportInterface* createThreadSupport( int numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName)
{
b3PosixThreadSupport::ThreadConstructionInfo constructionInfo( uniqueName, threadFunc, localStoreFunc, numThreads );
b3ThreadSupportInterface* threadSupport = new b3PosixThreadSupport( constructionInfo );
return threadSupport;
}
#endif // #else // #if defined( _WIN32 )
///
/// getNumHardwareThreads()
///
///
/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
///
#if __cplusplus >= 201103L
#include <thread>
int getNumHardwareThreads()
{
return std::thread::hardware_concurrency();
}
#elif defined( _WIN32 )
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
int getNumHardwareThreads()
{
// caps out at 32
SYSTEM_INFO info;
GetSystemInfo( &info );
return info.dwNumberOfProcessors;
}
#else
int getNumHardwareThreads()
{
return 0; // don't know
}
#endif
struct WorkerThreadStatus
{
enum Type
{
kInvalid,
kWaitingForWork,
kWorking,
kSleeping,
};
};
struct IJob
{
virtual void executeJob() = 0;
};
class ParallelForJob : public IJob
{
const btIParallelForBody* mBody;
int mBegin;
int mEnd;
public:
ParallelForJob()
{
mBody = NULL;
mBegin = 0;
mEnd = 0;
}
void init( int iBegin, int iEnd, const btIParallelForBody& body )
{
mBody = &body;
mBegin = iBegin;
mEnd = iEnd;
}
virtual void executeJob() BT_OVERRIDE
{
BT_PROFILE( "executeJob" );
// call the functor body to do the work
mBody->forLoop( mBegin, mEnd );
}
};
struct JobContext
{
JobContext()
{
m_queueLock = NULL;
m_headIndex = 0;
m_tailIndex = 0;
m_workersShouldCheckQueue = false;
m_useSpinMutex = false;
}
b3CriticalSection* m_queueLock;
btSpinMutex m_mutex;
volatile bool m_workersShouldCheckQueue;
btAlignedObjectArray<IJob*> m_jobQueue;
bool m_queueIsEmpty;
int m_tailIndex;
int m_headIndex;
bool m_useSpinMutex;
void lockQueue()
{
if ( m_useSpinMutex )
{
m_mutex.lock();
}
else
{
m_queueLock->lock();
}
}
void unlockQueue()
{
if ( m_useSpinMutex )
{
m_mutex.unlock();
}
else
{
m_queueLock->unlock();
}
}
void clearQueue()
{
lockQueue();
m_headIndex = 0;
m_tailIndex = 0;
m_queueIsEmpty = true;
unlockQueue();
m_jobQueue.resizeNoInitialize( 0 );
}
void submitJob( IJob* job )
{
m_jobQueue.push_back( job );
lockQueue();
m_tailIndex++;
m_queueIsEmpty = false;
unlockQueue();
}
IJob* consumeJob()
{
if ( m_queueIsEmpty )
{
// lock free path. even if this is taken erroneously it isn't harmful
return NULL;
}
IJob* job = NULL;
lockQueue();
if ( !m_queueIsEmpty )
{
job = m_jobQueue[ m_headIndex++ ];
if ( m_headIndex == m_tailIndex )
{
m_queueIsEmpty = true;
}
}
unlockQueue();
return job;
}
};
struct WorkerThreadLocalStorage
{
int threadId;
WorkerThreadStatus::Type status;
};
static void WorkerThreadFunc( void* userPtr, void* lsMemory )
{
BT_PROFILE( "WorkerThreadFunc" );
WorkerThreadLocalStorage* localStorage = (WorkerThreadLocalStorage*) lsMemory;
localStorage->status = WorkerThreadStatus::kWaitingForWork;
//printf( "WorkerThreadFunc: worker %d start working\n", localStorage->threadId );
JobContext* jobContext = (JobContext*) userPtr;
while ( jobContext->m_workersShouldCheckQueue )
{
if ( IJob* job = jobContext->consumeJob() )
{
localStorage->status = WorkerThreadStatus::kWorking;
job->executeJob();
localStorage->status = WorkerThreadStatus::kWaitingForWork;
}
else
{
// todo: spin wait a bit to avoid hammering the empty queue
}
}
//printf( "WorkerThreadFunc stop working\n" );
localStorage->status = WorkerThreadStatus::kSleeping;
// go idle
}
static void* WorkerThreadAllocFunc()
{
return new WorkerThreadLocalStorage;
}
class btTaskSchedulerDefault : public btITaskScheduler
{
JobContext m_jobContext;
b3ThreadSupportInterface* m_threadSupport;
btAlignedObjectArray<ParallelForJob> m_jobs;
btSpinMutex m_antiNestingLock; // prevent nested parallel-for
int m_numThreads;
int m_numWorkerThreads;
int m_numWorkersRunning;
public:
btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
{
m_threadSupport = NULL;
m_numThreads = getNumHardwareThreads();
// if can't detect number of cores,
if ( m_numThreads == 0 )
{
// take a guess
m_numThreads = 4;
}
m_numWorkerThreads = m_numThreads - 1;
m_numWorkersRunning = 0;
}
virtual ~btTaskSchedulerDefault()
{
shutdown();
}
void init()
{
int maxNumWorkerThreads = BT_MAX_THREAD_COUNT - 1;
m_threadSupport = createThreadSupport( maxNumWorkerThreads, WorkerThreadFunc, WorkerThreadAllocFunc, "TaskScheduler" );
m_jobContext.m_queueLock = m_threadSupport->createCriticalSection();
for ( int i = 0; i < maxNumWorkerThreads; i++ )
{
WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
btAssert( storage );
storage->threadId = i;
storage->status = WorkerThreadStatus::kSleeping;
}
setWorkersActive( false ); // no work for them yet
}
virtual void shutdown()
{
setWorkersActive( false );
waitForWorkersToSleep();
m_threadSupport->deleteCriticalSection( m_jobContext.m_queueLock );
m_jobContext.m_queueLock = NULL;
delete m_threadSupport;
m_threadSupport = NULL;
}
void setWorkersActive( bool active )
{
m_jobContext.m_workersShouldCheckQueue = active;
}
virtual int getMaxNumThreads() const BT_OVERRIDE
{
return BT_MAX_THREAD_COUNT;
}
virtual int getNumThreads() const BT_OVERRIDE
{
return m_numThreads;
}
virtual void setNumThreads( int numThreads ) BT_OVERRIDE
{
m_numThreads = btMax( btMin(numThreads, int(BT_MAX_THREAD_COUNT)), 1 );
m_numWorkerThreads = m_numThreads - 1;
}
void waitJobs()
{
BT_PROFILE( "waitJobs" );
// have the main thread work until the job queue is empty
for ( ;; )
{
if ( IJob* job = m_jobContext.consumeJob() )
{
job->executeJob();
}
else
{
break;
}
}
// done with jobs for now, tell workers to rest
setWorkersActive( false );
waitForWorkersToSleep();
}
void wakeWorkers()
{
BT_PROFILE( "wakeWorkers" );
btAssert( m_jobContext.m_workersShouldCheckQueue );
// tell each worker thread to start working
for ( int i = 0; i < m_numWorkerThreads; i++ )
{
m_threadSupport->runTask( B3_THREAD_SCHEDULE_TASK, &m_jobContext, i );
m_numWorkersRunning++;
}
}
void waitForWorkersToSleep()
{
BT_PROFILE( "waitForWorkersToSleep" );
while ( m_numWorkersRunning > 0 )
{
int iThread;
int threadStatus;
m_threadSupport->waitForResponse( &iThread, &threadStatus ); // wait for worker threads to finish working
m_numWorkersRunning--;
}
//m_threadSupport->waitForAllTasksToComplete();
for ( int i = 0; i < m_numWorkerThreads; i++ )
{
//m_threadSupport->waitForTaskCompleted( i );
WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
btAssert( storage );
btAssert( storage->status == WorkerThreadStatus::kSleeping );
}
}
virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
{
BT_PROFILE( "parallelFor_ThreadSupport" );
btAssert( iEnd >= iBegin );
btAssert( grainSize >= 1 );
int iterationCount = iEnd - iBegin;
if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
{
int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;
btAssert( jobCount >= 2 ); // need more than one job for multithreading
if ( jobCount > m_jobs.size() )
{
m_jobs.resize( jobCount );
}
if ( jobCount > m_jobContext.m_jobQueue.capacity() )
{
m_jobContext.m_jobQueue.reserve( jobCount );
}
m_jobContext.clearQueue();
// prepare worker threads for incoming work
setWorkersActive( true );
wakeWorkers();
// submit all of the jobs
int iJob = 0;
for ( int i = iBegin; i < iEnd; i += grainSize )
{
btAssert( iJob < jobCount );
int iE = btMin( i + grainSize, iEnd );
ParallelForJob& job = m_jobs[ iJob ];
job.init( i, iE, body );
m_jobContext.submitJob( &job );
iJob++;
}
// put the main thread to work on emptying the job queue and then wait for all workers to finish
waitJobs();
m_antiNestingLock.unlock();
}
else
{
BT_PROFILE( "parallelFor_mainThread" );
// just run on main thread
body.forLoop( iBegin, iEnd );
}
}
};
btITaskScheduler* createDefaultTaskScheduler()
{
btTaskSchedulerDefault* ts = new btTaskSchedulerDefault();
ts->init();
return ts;
}
#else // #if BT_THREADSAFE
btITaskScheduler* createDefaultTaskScheduler()
{
return NULL;
}
#endif // #else // #if BT_THREADSAFE

View File

@@ -1,26 +0,0 @@
/*
Copyright (c) 2003-2014 Erwin Coumans http://bullet.googlecode.com
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef BT_TASK_SCHEDULER_H
#define BT_TASK_SCHEDULER_H
class btITaskScheduler;
btITaskScheduler* createDefaultTaskScheduler();
#endif // BT_TASK_SCHEDULER_H