From dfe184e8d3bcb7a81f3372c91388895c7e9218cf Mon Sep 17 00:00:00 2001 From: Lunkhound Date: Mon, 22 May 2017 00:47:11 -0700 Subject: [PATCH 1/7] Bullet 2 threading refactor: moved parallel-for calls into core libs --- CMakeLists.txt | 42 +- examples/Benchmarks/BenchmarkDemo.cpp | 5 +- examples/ExampleBrowser/CMakeLists.txt | 28 +- .../CommonRigidBodyMTBase.cpp | 576 ++++-------------- examples/MultiThreadedDemo/ParallelFor.h | 336 ---------- src/BulletCollision/CMakeLists.txt | 2 + .../btCollisionDispatcherMt.cpp | 164 +++++ .../btCollisionDispatcherMt.h | 39 ++ .../Dynamics/btDiscreteDynamicsWorldMt.cpp | 165 ++++- .../Dynamics/btDiscreteDynamicsWorldMt.h | 96 ++- .../Dynamics/btSimulationIslandManagerMt.cpp | 39 +- .../Dynamics/btSimulationIslandManagerMt.h | 3 +- src/LinearMath/btThreads.cpp | 304 ++++++++- src/LinearMath/btThreads.h | 60 ++ 14 files changed, 1012 insertions(+), 847 deletions(-) delete mode 100644 examples/MultiThreadedDemo/ParallelFor.h create mode 100644 src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.cpp create mode 100644 src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6ad1a9706..65859e663 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,15 @@ OPTION(USE_DOUBLE_PRECISION "Use double precision" OFF) OPTION(USE_GRAPHICAL_BENCHMARK "Use Graphical Benchmark" ON) OPTION(BUILD_SHARED_LIBS "Use shared libraries" OFF) OPTION(USE_SOFT_BODY_MULTI_BODY_DYNAMICS_WORLD "Use btSoftMultiBodyDynamicsWorld" OFF) -OPTION(BULLET2_USE_THREAD_LOCKS "Build Bullet 2 libraries with mutex locking around certain operations" OFF) + +OPTION(BULLET2_USE_THREAD_LOCKS "Build Bullet 2 libraries with mutex locking around certain operations (required for multi-threading)" OFF) +IF (BULLET2_USE_THREAD_LOCKS) + OPTION(BULLET2_USE_OPEN_MP_MULTITHREADING "Build Bullet 2 with support for multi-threading with OpenMP (requires a compiler with OpenMP support)" OFF) + OPTION(BULLET2_USE_TBB_MULTITHREADING "Build Bullet 2 with support for multi-threading with Intel Threading Building Blocks (requires the TBB library to be already installed)" OFF) + IF (MSVC) + OPTION(BULLET2_USE_PPL_MULTITHREADING "Build Bullet 2 with support for multi-threading with Microsoft Parallel Patterns Library (requires MSVC compiler)" OFF) + ENDIF (MSVC) +ENDIF (BULLET2_USE_THREAD_LOCKS) OPTION(USE_MSVC_INCREMENTAL_LINKING "Use MSVC Incremental Linking" OFF) OPTION(USE_CUSTOM_VECTOR_MATH "Use custom vectormath library" OFF) @@ -208,6 +216,30 @@ IF(BULLET2_USE_THREAD_LOCKS) ENDIF (NOT MSVC) ENDIF (BULLET2_USE_THREAD_LOCKS) +IF (BULLET2_USE_OPEN_MP_MULTITHREADING) + ADD_DEFINITIONS("-DBT_USE_OPENMP=1") + IF (MSVC) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp") + ELSE (MSVC) + # GCC, Clang + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") + ENDIF (MSVC) +ENDIF (BULLET2_USE_OPEN_MP_MULTITHREADING) + +IF (BULLET2_USE_TBB_MULTITHREADING) + SET (BULLET2_TBB_INCLUDE_DIR "not found" CACHE PATH "Directory for Intel TBB includes.") + SET (BULLET2_TBB_LIB_DIR "not found" CACHE PATH "Directory for Intel TBB libraries.") + find_library(TBB_LIBRARY tbb PATHS ${BULLET2_TBB_LIB_DIR}) + find_library(TBBMALLOC_LIBRARY tbbmalloc PATHS ${BULLET2_TBB_LIB_DIR}) + ADD_DEFINITIONS("-DBT_USE_TBB=1") + INCLUDE_DIRECTORIES( ${BULLET2_TBB_INCLUDE_DIR} ) + LINK_LIBRARIES( ${TBB_LIBRARY} ${TBBMALLOC_LIBRARY} ) +ENDIF (BULLET2_USE_TBB_MULTITHREADING) + +IF (BULLET2_USE_PPL_MULTITHREADING) + ADD_DEFINITIONS("-DBT_USE_PPL=1") +ENDIF (BULLET2_USE_PPL_MULTITHREADING) + IF (WIN32) OPTION(USE_GLUT "Use Glut" ON) ADD_DEFINITIONS( -D_CRT_SECURE_NO_WARNINGS ) @@ -350,14 +382,6 @@ IF(BUILD_BULLET2_DEMOS) SUBDIRS(examples) ENDIF() - IF (BULLET2_USE_THREAD_LOCKS) - OPTION(BULLET2_MULTITHREADED_OPEN_MP_DEMO "Build Bullet 2 MultithreadedDemo using OpenMP (requires a compiler with OpenMP support)" OFF) - OPTION(BULLET2_MULTITHREADED_TBB_DEMO "Build Bullet 2 MultithreadedDemo using Intel Threading Building Blocks (requires the TBB library to be already installed)" OFF) - IF (MSVC) - OPTION(BULLET2_MULTITHREADED_PPL_DEMO "Build Bullet 2 MultithreadedDemo using Microsoft Parallel Patterns Library (requires MSVC compiler)" OFF) - ENDIF (MSVC) - ENDIF (BULLET2_USE_THREAD_LOCKS) - ENDIF(BUILD_BULLET2_DEMOS) diff --git a/examples/Benchmarks/BenchmarkDemo.cpp b/examples/Benchmarks/BenchmarkDemo.cpp index d9a1b3ef0..ec443de6b 100644 --- a/examples/Benchmarks/BenchmarkDemo.cpp +++ b/examples/Benchmarks/BenchmarkDemo.cpp @@ -32,7 +32,6 @@ subject to the following restrictions: #include "LinearMath/btAlignedObjectArray.h" #include "LinearMath/btTransform.h" -#include "../MultiThreadedDemo/ParallelFor.h" class btDynamicsWorld; @@ -230,7 +229,7 @@ public: } } - struct CastRaysLoopBody + struct CastRaysLoopBody : public btIParallelForBody { btCollisionWorld* mWorld; btRaycastBar2* mRaycasts; @@ -274,7 +273,7 @@ public: { CastRaysLoopBody rayLooper(cw, this); int grainSize = 20; // number of raycasts per task - parallelFor( 0, NUMRAYS, grainSize, rayLooper ); + btParallelFor( 0, NUMRAYS, grainSize, rayLooper ); } else #endif // USE_PARALLEL_RAYCASTS diff --git a/examples/ExampleBrowser/CMakeLists.txt b/examples/ExampleBrowser/CMakeLists.txt index 23c283fcb..4a4db93db 100644 --- a/examples/ExampleBrowser/CMakeLists.txt +++ b/examples/ExampleBrowser/CMakeLists.txt @@ -110,29 +110,6 @@ ELSE(WIN32) ENDIF(APPLE) ENDIF(WIN32) -IF (BULLET2_MULTITHREADED_OPEN_MP_DEMO) - ADD_DEFINITIONS("-DBT_USE_OPENMP=1") - IF (MSVC) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp") - ELSE (MSVC) - # GCC, Clang - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") - ENDIF (MSVC) -ENDIF (BULLET2_MULTITHREADED_OPEN_MP_DEMO) - -IF (BULLET2_MULTITHREADED_PPL_DEMO) - ADD_DEFINITIONS("-DBT_USE_PPL=1") -ENDIF (BULLET2_MULTITHREADED_PPL_DEMO) - -IF (BULLET2_MULTITHREADED_TBB_DEMO) - SET (BULLET2_TBB_INCLUDE_DIR "not found" CACHE PATH "Directory for Intel TBB includes.") - SET (BULLET2_TBB_LIB_DIR "not found" CACHE PATH "Directory for Intel TBB libraries.") - find_library(TBB_LIBRARY tbb PATHS ${BULLET2_TBB_LIB_DIR}) - find_library(TBBMALLOC_LIBRARY tbbmalloc PATHS ${BULLET2_TBB_LIB_DIR}) - ADD_DEFINITIONS("-DBT_USE_TBB=1") - INCLUDE_DIRECTORIES( ${BULLET2_TBB_INCLUDE_DIR} ) - LINK_LIBRARIES( ${TBB_LIBRARY} ${TBBMALLOC_LIBRARY} ) -ENDIF (BULLET2_MULTITHREADED_TBB_DEMO) SET(ExtendedTutorialsSources ../ExtendedTutorials/Chain.cpp @@ -207,7 +184,6 @@ SET(BulletExampleBrowser_SRCS ../MultiThreadedDemo/MultiThreadedDemo.h ../MultiThreadedDemo/CommonRigidBodyMTBase.cpp ../MultiThreadedDemo/CommonRigidBodyMTBase.h - ../MultiThreadedDemo/ParallelFor.h ../Tutorial/Tutorial.cpp ../Tutorial/Tutorial.h ../Tutorial/Dof6ConstraintTutorial.cpp @@ -386,7 +362,7 @@ ADD_CUSTOM_COMMAND( COMMAND ${CMAKE_COMMAND} ARGS -E copy_directory ${BULLET_PHYSICS_SOURCE_DIR}/data ${PROJECT_BINARY_DIR}/data ) -IF (BULLET2_MULTITHREADED_TBB_DEMO AND WIN32) +IF (BULLET2_USE_TBB_MULTITHREADING AND WIN32) # add a post build command to copy some dlls to the executable directory set(TBB_VC_VER "vc12") set(TBB_VC_ARCH "ia32") @@ -400,7 +376,7 @@ IF (BULLET2_MULTITHREADED_TBB_DEMO AND WIN32) COMMAND ${CMAKE_COMMAND} -E copy_if_different "${BULLET2_TBB_INCLUDE_DIR}/../bin/${TBB_VC_ARCH}/${TBB_VC_VER}/tbbmalloc.dll" $) -ENDIF (BULLET2_MULTITHREADED_TBB_DEMO AND WIN32) +ENDIF (BULLET2_USE_TBB_MULTITHREADING AND WIN32) IF (INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES) diff --git a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp index 81446ee0b..82797e35e 100644 --- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp +++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp @@ -23,10 +23,10 @@ class btCollisionShape; #include "CommonRigidBodyMTBase.h" #include "../CommonInterfaces/CommonParameterInterface.h" -#include "ParallelFor.h" #include "LinearMath/btAlignedObjectArray.h" #include "LinearMath/btPoolAllocator.h" #include "btBulletCollisionCommon.h" +#include "BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h" #include "BulletDynamics/Dynamics/btSimulationIslandManagerMt.h" // for setSplitIslands() #include "BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h" #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h" @@ -36,20 +36,6 @@ class btCollisionShape; #include "BulletDynamics/MLCPSolvers/btDantzigSolver.h" #include "BulletDynamics/MLCPSolvers/btLemkeSolver.h" -TaskManager gTaskMgr; - -#define USE_PARALLEL_NARROWPHASE 1 // detect collisions in parallel -#define USE_PARALLEL_ISLAND_SOLVER 1 // solve simulation islands in parallel -#define USE_PARALLEL_CREATE_PREDICTIVE_CONTACTS 1 -#define USE_PARALLEL_INTEGRATE_TRANSFORMS 1 -#define USE_PARALLEL_PREDICT_UNCONSTRAINED_MOTION 1 - -#if defined (_MSC_VER) && _MSC_VER >= 1600 -// give us a compile error if any signatures of overriden methods is changed -#define BT_OVERRIDE override -#else -#define BT_OVERRIDE -#endif static int gNumIslands = 0; @@ -124,7 +110,7 @@ public: }; -Profiler gProfiler; +static Profiler gProfiler; class ProfileHelper { @@ -141,457 +127,84 @@ public: } }; -int gThreadsRunningCounter = 0; -btSpinMutex gThreadsRunningCounterMutex; - -void btPushThreadsAreRunning() +static void profileBeginCallback( btDynamicsWorld *world, btScalar timeStep ) { - gThreadsRunningCounterMutex.lock(); - gThreadsRunningCounter++; - gThreadsRunningCounterMutex.unlock(); + gProfiler.begin( Profiler::kRecordInternalTimeStep ); } -void btPopThreadsAreRunning() +static void profileEndCallback( btDynamicsWorld *world, btScalar timeStep ) { - gThreadsRunningCounterMutex.lock(); - gThreadsRunningCounter--; - gThreadsRunningCounterMutex.unlock(); -} - -bool btThreadsAreRunning() -{ - return gThreadsRunningCounter != 0; + gProfiler.end( Profiler::kRecordInternalTimeStep ); } -#if USE_PARALLEL_NARROWPHASE - -class MyCollisionDispatcher : public btCollisionDispatcher +/// +/// MyCollisionDispatcher -- subclassed for profiling purposes +/// +class MyCollisionDispatcher : public btCollisionDispatcherMt { - btSpinMutex m_manifoldPtrsMutex; - + typedef btCollisionDispatcherMt ParentClass; public: - MyCollisionDispatcher( btCollisionConfiguration* config ) : btCollisionDispatcher( config ) + MyCollisionDispatcher( btCollisionConfiguration* config, int grainSize ) : btCollisionDispatcherMt( config, grainSize ) { } - virtual ~MyCollisionDispatcher() - { - } - - btPersistentManifold* getNewManifold( const btCollisionObject* body0, const btCollisionObject* body1 ) BT_OVERRIDE - { - // added spin-locks - //optional relative contact breaking threshold, turned on by default (use setDispatcherFlags to switch off feature for improved performance) - - btScalar contactBreakingThreshold = ( m_dispatcherFlags & btCollisionDispatcher::CD_USE_RELATIVE_CONTACT_BREAKING_THRESHOLD ) ? - btMin( body0->getCollisionShape()->getContactBreakingThreshold( gContactBreakingThreshold ), body1->getCollisionShape()->getContactBreakingThreshold( gContactBreakingThreshold ) ) - : gContactBreakingThreshold; - - btScalar contactProcessingThreshold = btMin( body0->getContactProcessingThreshold(), body1->getContactProcessingThreshold() ); - - void* mem = m_persistentManifoldPoolAllocator->allocate( sizeof( btPersistentManifold ) ); - if (NULL == mem) - { - //we got a pool memory overflow, by default we fallback to dynamically allocate memory. If we require a contiguous contact pool then assert. - if ( ( m_dispatcherFlags&CD_DISABLE_CONTACTPOOL_DYNAMIC_ALLOCATION ) == 0 ) - { - mem = btAlignedAlloc( sizeof( btPersistentManifold ), 16 ); - } - else - { - btAssert( 0 ); - //make sure to increase the m_defaultMaxPersistentManifoldPoolSize in the btDefaultCollisionConstructionInfo/btDefaultCollisionConfiguration - return 0; - } - } - btPersistentManifold* manifold = new(mem) btPersistentManifold( body0, body1, 0, contactBreakingThreshold, contactProcessingThreshold ); - m_manifoldPtrsMutex.lock(); - manifold->m_index1a = m_manifoldsPtr.size(); - m_manifoldsPtr.push_back( manifold ); - m_manifoldPtrsMutex.unlock(); - - return manifold; - } - - void releaseManifold( btPersistentManifold* manifold ) BT_OVERRIDE - { - clearManifold( manifold ); - - m_manifoldPtrsMutex.lock(); - int findIndex = manifold->m_index1a; - btAssert( findIndex < m_manifoldsPtr.size() ); - m_manifoldsPtr.swap( findIndex, m_manifoldsPtr.size() - 1 ); - m_manifoldsPtr[ findIndex ]->m_index1a = findIndex; - m_manifoldsPtr.pop_back(); - m_manifoldPtrsMutex.unlock(); - - manifold->~btPersistentManifold(); - if ( m_persistentManifoldPoolAllocator->validPtr( manifold ) ) - { - m_persistentManifoldPoolAllocator->freeMemory( manifold ); - } - else - { - btAlignedFree( manifold ); - } - } - - struct Updater - { - btBroadphasePair* mPairArray; - btNearCallback mCallback; - btCollisionDispatcher* mDispatcher; - const btDispatcherInfo* mInfo; - - Updater() - { - mPairArray = NULL; - mCallback = NULL; - mDispatcher = NULL; - mInfo = NULL; - } - void forLoop( int iBegin, int iEnd ) const - { - for ( int i = iBegin; i < iEnd; ++i ) - { - btBroadphasePair* pair = &mPairArray[ i ]; - mCallback( *pair, *mDispatcher, *mInfo ); - } - } - }; - virtual void dispatchAllCollisionPairs( btOverlappingPairCache* pairCache, const btDispatcherInfo& info, btDispatcher* dispatcher ) BT_OVERRIDE { - ProfileHelper prof(Profiler::kRecordDispatchAllCollisionPairs); - int grainSize = 40; // iterations per task - int pairCount = pairCache->getNumOverlappingPairs(); - Updater updater; - updater.mCallback = getNearCallback(); - updater.mPairArray = pairCount > 0 ? pairCache->getOverlappingPairArrayPtr() : NULL; - updater.mDispatcher = this; - updater.mInfo = &info; - - btPushThreadsAreRunning(); - parallelFor( 0, pairCount, grainSize, updater ); - btPopThreadsAreRunning(); - - if (m_manifoldsPtr.size() < 1) - return; - - // reconstruct the manifolds array to ensure determinism - m_manifoldsPtr.resizeNoInitialize(0); - btBroadphasePair* pairs = pairCache->getOverlappingPairArrayPtr(); - for (int i = 0; i < pairCount; ++i) - { - btCollisionAlgorithm* algo = pairs[i].m_algorithm; - if (algo) algo->getAllContactManifolds(m_manifoldsPtr); - } - - // update the indices (used when releasing manifolds) - for (int i = 0; i < m_manifoldsPtr.size(); ++i) - m_manifoldsPtr[i]->m_index1a = i; + ProfileHelper prof( Profiler::kRecordDispatchAllCollisionPairs ); + ParentClass::dispatchAllCollisionPairs( pairCache, info, dispatcher ); } }; -#endif - -#if USE_PARALLEL_ISLAND_SOLVER /// -/// MyConstraintSolverPool - masquerades as a constraint solver, but really it is a threadsafe pool of them. -/// -/// Each solver in the pool is protected by a mutex. When solveGroup is called from a thread, -/// the pool looks for a solver that isn't being used by another thread, locks it, and dispatches the -/// call to the solver. -/// So long as there are at least as many solvers as there are hardware threads, it should never need to -/// spin wait. -/// -class MyConstraintSolverPool : public btConstraintSolver +/// myParallelIslandDispatch -- wrap default parallel dispatch for profiling and to get the number of simulation islands +// +void myParallelIslandDispatch( btAlignedObjectArray* islandsPtr, btSimulationIslandManagerMt::IslandCallback* callback ) { - const static size_t kCacheLineSize = 128; - struct ThreadSolver - { - btConstraintSolver* solver; - btSpinMutex mutex; - char _cachelinePadding[ kCacheLineSize - sizeof( btSpinMutex ) - sizeof( void* ) ]; // keep mutexes from sharing a cache line - }; - btAlignedObjectArray m_solvers; - btConstraintSolverType m_solverType; - - ThreadSolver* getAndLockThreadSolver() - { - while ( true ) - { - for ( int i = 0; i < m_solvers.size(); ++i ) - { - ThreadSolver& solver = m_solvers[ i ]; - if ( solver.mutex.tryLock() ) - { - return &solver; - } - } - } - return NULL; - } - void init( btConstraintSolver** solvers, int numSolvers ) - { - m_solverType = BT_SEQUENTIAL_IMPULSE_SOLVER; - m_solvers.resize( numSolvers ); - for ( int i = 0; i < numSolvers; ++i ) - { - m_solvers[ i ].solver = solvers[ i ]; - } - if ( numSolvers > 0 ) - { - m_solverType = solvers[ 0 ]->getSolverType(); - } - } -public: - // create the solvers for me - explicit MyConstraintSolverPool( int numSolvers ) - { - btAlignedObjectArray solvers; - solvers.reserve( numSolvers ); - for ( int i = 0; i < numSolvers; ++i ) - { - btConstraintSolver* solver = new btSequentialImpulseConstraintSolver(); - solvers.push_back( solver ); - } - init( &solvers[ 0 ], numSolvers ); - } - - // pass in fully constructed solvers (destructor will delete them) - MyConstraintSolverPool( btConstraintSolver** solvers, int numSolvers ) - { - init( solvers, numSolvers ); - } - virtual ~MyConstraintSolverPool() - { - // delete all solvers - for ( int i = 0; i < m_solvers.size(); ++i ) - { - ThreadSolver& solver = m_solvers[ i ]; - delete solver.solver; - solver.solver = NULL; - } - } - - //virtual void prepareSolve( int /* numBodies */, int /* numManifolds */ ) { ; } // does nothing - - ///solve a group of constraints - virtual btScalar solveGroup( btCollisionObject** bodies, - int numBodies, - btPersistentManifold** manifolds, - int numManifolds, - btTypedConstraint** constraints, - int numConstraints, - const btContactSolverInfo& info, - btIDebugDraw* debugDrawer, - btDispatcher* dispatcher - ) - { - ThreadSolver* solver = getAndLockThreadSolver(); - solver->solver->solveGroup( bodies, numBodies, manifolds, numManifolds, constraints, numConstraints, info, debugDrawer, dispatcher ); - solver->mutex.unlock(); - return 0.0f; - } - - //virtual void allSolved( const btContactSolverInfo& /* info */, class btIDebugDraw* /* debugDrawer */ ) { ; } // does nothing - - ///clear internal cached data and reset random seed - virtual void reset() - { - for ( int i = 0; i < m_solvers.size(); ++i ) - { - ThreadSolver& solver = m_solvers[ i ]; - solver.mutex.lock(); - solver.solver->reset(); - solver.mutex.unlock(); - } - } - - virtual btConstraintSolverType getSolverType() const - { - return m_solverType; - } -}; - -struct UpdateIslandDispatcher -{ - btAlignedObjectArray* islandsPtr; - btSimulationIslandManagerMt::IslandCallback* callback; - - void forLoop( int iBegin, int iEnd ) const - { - for ( int i = iBegin; i < iEnd; ++i ) - { - btSimulationIslandManagerMt::Island* island = ( *islandsPtr )[ i ]; - btPersistentManifold** manifolds = island->manifoldArray.size() ? &island->manifoldArray[ 0 ] : NULL; - btTypedConstraint** constraintsPtr = island->constraintArray.size() ? &island->constraintArray[ 0 ] : NULL; - callback->processIsland( &island->bodyArray[ 0 ], - island->bodyArray.size(), - manifolds, - island->manifoldArray.size(), - constraintsPtr, - island->constraintArray.size(), - island->id - ); - } - } -}; - -void parallelIslandDispatch( btAlignedObjectArray* islandsPtr, btSimulationIslandManagerMt::IslandCallback* callback ) -{ - ProfileHelper prof(Profiler::kRecordDispatchIslands); + ProfileHelper prof( Profiler::kRecordDispatchIslands ); gNumIslands = islandsPtr->size(); - int grainSize = 1; // iterations per task - UpdateIslandDispatcher dispatcher; - dispatcher.islandsPtr = islandsPtr; - dispatcher.callback = callback; - btPushThreadsAreRunning(); - parallelFor( 0, islandsPtr->size(), grainSize, dispatcher ); - btPopThreadsAreRunning(); -} -#endif //#if USE_PARALLEL_ISLAND_SOLVER - - -void profileBeginCallback(btDynamicsWorld *world, btScalar timeStep) -{ - gProfiler.begin(Profiler::kRecordInternalTimeStep); + btSimulationIslandManagerMt::parallelIslandDispatch( islandsPtr, callback ); } -void profileEndCallback(btDynamicsWorld *world, btScalar timeStep) -{ - gProfiler.end(Profiler::kRecordInternalTimeStep); -} /// -/// MyDiscreteDynamicsWorld -/// -/// Should function exactly like btDiscreteDynamicsWorld. -/// 3 methods that iterate over all of the rigidbodies can run in parallel: -/// - predictUnconstraintMotion -/// - integrateTransforms -/// - createPredictiveContacts +/// MyDiscreteDynamicsWorld -- subclassed for profiling purposes /// ATTRIBUTE_ALIGNED16( class ) MyDiscreteDynamicsWorld : public btDiscreteDynamicsWorldMt { typedef btDiscreteDynamicsWorld ParentClass; protected: -#if USE_PARALLEL_PREDICT_UNCONSTRAINED_MOTION - struct UpdaterUnconstrainedMotion - { - btScalar timeStep; - btRigidBody** rigidBodies; - - void forLoop( int iBegin, int iEnd ) const - { - for ( int i = iBegin; i < iEnd; ++i ) - { - btRigidBody* body = rigidBodies[ i ]; - if ( !body->isStaticOrKinematicObject() ) - { - //don't integrate/update velocities here, it happens in the constraint solver - body->applyDamping( timeStep ); - body->predictIntegratedTransform( timeStep, body->getInterpolationWorldTransform() ); - } - } - } - }; virtual void predictUnconstraintMotion( btScalar timeStep ) BT_OVERRIDE { ProfileHelper prof( Profiler::kRecordPredictUnconstrainedMotion ); - BT_PROFILE( "predictUnconstraintMotion" ); - int grainSize = 50; // num of iterations per task for TBB - int bodyCount = m_nonStaticRigidBodies.size(); - UpdaterUnconstrainedMotion update; - update.timeStep = timeStep; - update.rigidBodies = bodyCount ? &m_nonStaticRigidBodies[ 0 ] : NULL; - btPushThreadsAreRunning(); - parallelFor( 0, bodyCount, grainSize, update ); - btPopThreadsAreRunning(); + ParentClass::predictUnconstraintMotion( timeStep ); } -#endif // #if USE_PARALLEL_PREDICT_UNCONSTRAINED_MOTION - -#if USE_PARALLEL_CREATE_PREDICTIVE_CONTACTS - struct UpdaterCreatePredictiveContacts - { - btScalar timeStep; - btRigidBody** rigidBodies; - MyDiscreteDynamicsWorld* world; - - void forLoop( int iBegin, int iEnd ) const - { - world->createPredictiveContactsInternal( &rigidBodies[ iBegin ], iEnd - iBegin, timeStep ); - } - }; - - virtual void createPredictiveContacts( btScalar timeStep ) + virtual void createPredictiveContacts( btScalar timeStep ) BT_OVERRIDE { ProfileHelper prof( Profiler::kRecordCreatePredictiveContacts ); - releasePredictiveContacts(); - int grainSize = 50; // num of iterations per task for TBB or OPENMP - if ( int bodyCount = m_nonStaticRigidBodies.size() ) - { - UpdaterCreatePredictiveContacts update; - update.world = this; - update.timeStep = timeStep; - update.rigidBodies = &m_nonStaticRigidBodies[ 0 ]; - btPushThreadsAreRunning(); - parallelFor( 0, bodyCount, grainSize, update ); - btPopThreadsAreRunning(); - } + ParentClass::createPredictiveContacts( timeStep ); } -#endif // #if USE_PARALLEL_CREATE_PREDICTIVE_CONTACTS - -#if USE_PARALLEL_INTEGRATE_TRANSFORMS - struct UpdaterIntegrateTransforms - { - btScalar timeStep; - btRigidBody** rigidBodies; - MyDiscreteDynamicsWorld* world; - - void forLoop( int iBegin, int iEnd ) const - { - world->integrateTransformsInternal( &rigidBodies[ iBegin ], iEnd - iBegin, timeStep ); - } - }; - virtual void integrateTransforms( btScalar timeStep ) BT_OVERRIDE { ProfileHelper prof( Profiler::kRecordIntegrateTransforms ); - BT_PROFILE( "integrateTransforms" ); - int grainSize = 50; // num of iterations per task for TBB or OPENMP - if ( int bodyCount = m_nonStaticRigidBodies.size() ) - { - UpdaterIntegrateTransforms update; - update.world = this; - update.timeStep = timeStep; - update.rigidBodies = &m_nonStaticRigidBodies[ 0 ]; - btPushThreadsAreRunning(); - parallelFor( 0, bodyCount, grainSize, update ); - btPopThreadsAreRunning(); - } + ParentClass::integrateTransforms( timeStep ); } -#endif // #if USE_PARALLEL_INTEGRATE_TRANSFORMS public: BT_DECLARE_ALIGNED_ALLOCATOR(); MyDiscreteDynamicsWorld( btDispatcher* dispatcher, btBroadphaseInterface* pairCache, - btConstraintSolver* constraintSolver, + btConstraintSolverPoolMt* constraintSolver, btCollisionConfiguration* collisionConfiguration ) : btDiscreteDynamicsWorldMt( dispatcher, pairCache, constraintSolver, collisionConfiguration ) { -#if USE_PARALLEL_ISLAND_SOLVER btSimulationIslandManagerMt* islandMgr = static_cast( m_islandManager ); - islandMgr->setIslandDispatchFunction( parallelIslandDispatch ); -#endif //#if USE_PARALLEL_ISLAND_SOLVER + islandMgr->setIslandDispatchFunction( myParallelIslandDispatch ); } }; @@ -625,6 +238,47 @@ btConstraintSolver* createSolverByType( SolverType t ) } +/// +/// btTaskSchedulerManager -- manage a number of task schedulers so we can switch between them +/// +class btTaskSchedulerManager +{ + btAlignedObjectArray m_taskSchedulers; + +public: + btTaskSchedulerManager() {} + void init() + { + addTaskScheduler( btGetSequentialTaskScheduler() ); + addTaskScheduler( btGetOpenMPTaskScheduler() ); + addTaskScheduler( btGetTBBTaskScheduler() ); + addTaskScheduler( btGetPPLTaskScheduler() ); + if ( getNumTaskSchedulers() > 1 ) + { + // prefer a non-sequential scheduler if available + btSetTaskScheduler( m_taskSchedulers[ 1 ] ); + } + else + { + btSetTaskScheduler( m_taskSchedulers[ 0 ] ); + } + btGetTaskScheduler()->setNumThreads( btGetTaskScheduler()->getMaxNumThreads() ); + } + + void addTaskScheduler( btITaskScheduler* ts ) + { + if ( ts ) + { + m_taskSchedulers.push_back( ts ); + } + } + int getNumTaskSchedulers() const { return m_taskSchedulers.size(); } + btITaskScheduler* getTaskScheduler( int i ) { return m_taskSchedulers[ i ]; } +}; + + +static btTaskSchedulerManager gTaskSchedulerMgr; + static bool gMultithreadedWorld = false; static bool gDisplayProfileInfo = false; static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE; @@ -652,15 +306,17 @@ CommonRigidBodyMTBase::CommonRigidBodyMTBase( struct GUIHelperInterface* helper { m_multithreadedWorld = false; m_multithreadCapable = false; - gTaskMgr.init(); + if ( gTaskSchedulerMgr.getNumTaskSchedulers() == 0 ) + { + gTaskSchedulerMgr.init(); + } } CommonRigidBodyMTBase::~CommonRigidBodyMTBase() { - gTaskMgr.shutdown(); } -void boolPtrButtonCallback(int buttonId, bool buttonState, void* userPointer) +static void boolPtrButtonCallback(int buttonId, bool buttonState, void* userPointer) { if (bool* val = static_cast(userPointer)) { @@ -668,7 +324,7 @@ void boolPtrButtonCallback(int buttonId, bool buttonState, void* userPointer) } } -void toggleSolverModeCallback(int buttonId, bool buttonState, void* userPointer) +static void toggleSolverModeCallback(int buttonId, bool buttonState, void* userPointer) { if (buttonState) { @@ -687,7 +343,7 @@ void toggleSolverModeCallback(int buttonId, bool buttonState, void* userPointer) } } -void setSolverTypeCallback(int buttonId, bool buttonState, void* userPointer) +static void setSolverTypeCallback(int buttonId, bool buttonState, void* userPointer) { if (buttonId >= 0 && buttonId < SOLVER_TYPE_COUNT) { @@ -695,32 +351,30 @@ void setSolverTypeCallback(int buttonId, bool buttonState, void* userPointer) } } -void apiSelectButtonCallback(int buttonId, bool buttonState, void* userPointer) +static void setNumThreads( int numThreads ) { - gTaskMgr.setApi(static_cast(buttonId)); - if (gTaskMgr.getApi()==TaskManager::apiNone) + int newNumThreads = ( std::min )( numThreads, int( BT_MAX_THREAD_COUNT ) ); + int oldNumThreads = btGetTaskScheduler()->getNumThreads(); + // only call when the thread count is different + if ( newNumThreads != oldNumThreads ) { - gSliderNumThreads = 1.0f; - } - else - { - gSliderNumThreads = float(gTaskMgr.getNumThreads()); + btGetTaskScheduler()->setNumThreads( newNumThreads ); } } -void setThreadCountCallback(float val, void* userPtr) +static void apiSelectButtonCallback(int buttonId, bool buttonState, void* userPointer) { - if (gTaskMgr.getApi()==TaskManager::apiNone) - { - gSliderNumThreads = 1.0f; - } - else - { - gTaskMgr.setNumThreads( int( gSliderNumThreads ) ); - } + // change the task scheduler + btSetTaskScheduler( gTaskSchedulerMgr.getTaskScheduler( buttonId ) ); + setNumThreads( int( gSliderNumThreads ) ); } -void setSolverIterationCountCallback(float val, void* userPtr) +static void setThreadCountCallback(float val, void* userPtr) +{ + setNumThreads( int( gSliderNumThreads ) ); +} + +static void setSolverIterationCountCallback(float val, void* userPtr) { if (btDiscreteDynamicsWorld* world = reinterpret_cast(userPtr)) { @@ -733,6 +387,7 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld() gNumIslands = 0; m_solverType = gSolverType; #if BT_THREADSAFE && (BT_USE_OPENMP || BT_USE_PPL || BT_USE_TBB) + btAssert( btGetTaskScheduler() != NULL ); m_multithreadCapable = true; #endif if ( gMultithreadedWorld ) @@ -743,30 +398,24 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld() cci.m_defaultMaxCollisionAlgorithmPoolSize = 80000; m_collisionConfiguration = new btDefaultCollisionConfiguration( cci ); -#if USE_PARALLEL_NARROWPHASE - m_dispatcher = new MyCollisionDispatcher( m_collisionConfiguration ); -#else - m_dispatcher = new btCollisionDispatcher( m_collisionConfiguration ); -#endif //USE_PARALLEL_NARROWPHASE - + m_dispatcher = new MyCollisionDispatcher( m_collisionConfiguration, 40 ); m_broadphase = new btDbvtBroadphase(); -#if BT_THREADSAFE && USE_PARALLEL_ISLAND_SOLVER + btConstraintSolverPoolMt* solverPool; { btConstraintSolver* solvers[ BT_MAX_THREAD_COUNT ]; - int maxThreadCount = btMin( int(BT_MAX_THREAD_COUNT), TaskManager::getMaxNumThreads() ); + int maxThreadCount = BT_MAX_THREAD_COUNT; for ( int i = 0; i < maxThreadCount; ++i ) { solvers[ i ] = createSolverByType( m_solverType ); } - m_solver = new MyConstraintSolverPool( solvers, maxThreadCount ); + solverPool = new btConstraintSolverPoolMt( solvers, maxThreadCount ); + m_solver = solverPool; } -#else - m_solver = createSolverByType( m_solverType ); -#endif //#if USE_PARALLEL_ISLAND_SOLVER - btDiscreteDynamicsWorld* world = new MyDiscreteDynamicsWorld( m_dispatcher, m_broadphase, m_solver, m_collisionConfiguration ); + btDiscreteDynamicsWorld* world = new MyDiscreteDynamicsWorld( m_dispatcher, m_broadphase, solverPool, m_collisionConfiguration ); m_dynamicsWorld = world; m_multithreadedWorld = true; + btAssert( btGetTaskScheduler() != NULL ); } else { @@ -886,24 +535,25 @@ void CommonRigidBodyMTBase::createDefaultParameters() if (m_multithreadedWorld) { // create a button for each supported threading API - for (int iApi = 0; iApi < TaskManager::apiCount; ++iApi) + for ( int iApi = 0; iApi < gTaskSchedulerMgr.getNumTaskSchedulers(); ++iApi ) { - TaskManager::Api api = static_cast(iApi); - if (gTaskMgr.isSupported(api)) - { - char str[1024]; - sprintf(str, "API %s", gTaskMgr.getApiName(api)); - ButtonParams button( str, iApi, false ); - button.m_callback = apiSelectButtonCallback; - m_guiHelper->getParameterInterface()->registerButtonParameter( button ); - } + char str[ 1024 ]; + sprintf( str, "API %s", gTaskSchedulerMgr.getTaskScheduler(iApi)->getName() ); + ButtonParams button( str, iApi, false ); + button.m_callback = apiSelectButtonCallback; + m_guiHelper->getParameterInterface()->registerButtonParameter( button ); } { // create a slider to set the number of threads to use - gSliderNumThreads = float(gTaskMgr.getNumThreads()); + int numThreads = btGetTaskScheduler()->getNumThreads(); + // if slider has not been set yet (by another demo), + if ( gSliderNumThreads <= 1.0f ) + { + gSliderNumThreads = float( numThreads ); + } SliderParams slider("Thread count", &gSliderNumThreads); slider.m_minVal = 1.0f; - slider.m_maxVal = float(gTaskMgr.getMaxNumThreads()*2); + slider.m_maxVal = float( BT_MAX_THREAD_COUNT ); slider.m_callback = setThreadCountCallback; slider.m_clampToIntegers = true; m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider ); @@ -946,14 +596,14 @@ void CommonRigidBodyMTBase::drawScreenText() const btPersistentManifold* man = m_dispatcher->getManifoldByIndexInternal( i ); numContacts += man->getNumContacts(); } - const char* mtApi = TaskManager::getApiName( gTaskMgr.getApi() ); + const char* mtApi = btGetTaskScheduler()->getName(); sprintf( msg, "islands=%d bodies=%d manifolds=%d contacts=%d [%s] threads=%d", gNumIslands, m_dynamicsWorld->getNumCollisionObjects(), numManifolds, numContacts, mtApi, - gTaskMgr.getApi() == TaskManager::apiNone ? 1 : gTaskMgr.getNumThreads() + btGetTaskScheduler()->getNumThreads() ); m_guiHelper->getAppInterface()->drawText( msg, 100, yCoord, 0.4f ); yCoord += yStep; diff --git a/examples/MultiThreadedDemo/ParallelFor.h b/examples/MultiThreadedDemo/ParallelFor.h deleted file mode 100644 index cb1325025..000000000 --- a/examples/MultiThreadedDemo/ParallelFor.h +++ /dev/null @@ -1,336 +0,0 @@ -/* -Bullet Continuous Collision Detection and Physics Library -Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ - -This software is provided 'as-is', without any express or implied warranty. -In no event will the authors be held liable for any damages arising from the use of this software. -Permission is granted to anyone to use this software for any purpose, -including commercial applications, and to alter it and redistribute it freely, -subject to the following restrictions: - -1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. -2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. -3. This notice may not be removed or altered from any source distribution. -*/ - -#include //printf debugging -#include - - -// choose threading providers: -#if BT_USE_TBB -#define USE_TBB 1 // use Intel Threading Building Blocks for thread management -#endif - -#if BT_USE_PPL -#define USE_PPL 1 // use Microsoft Parallel Patterns Library (installed with Visual Studio 2010 and later) -#endif // BT_USE_PPL - -#if BT_USE_OPENMP -#define USE_OPENMP 1 // use OpenMP (also need to change compiler options for OpenMP support) -#endif - - -#if USE_OPENMP - -#include - -#endif // #if USE_OPENMP - - -#if USE_PPL - -#include // if you get a compile error here, check whether your version of Visual Studio includes PPL -// Visual Studio 2010 and later should come with it -#include // for GetProcessorCount() -#endif // #if USE_PPL - - -#if USE_TBB - -#define __TBB_NO_IMPLICIT_LINKAGE 1 -#include -#include -#include -#include - -#endif // #if USE_TBB - - - -class TaskManager -{ -public: - enum Api - { - apiNone, - apiOpenMP, - apiTbb, - apiPpl, - apiCount - }; - static const char* getApiName( Api api ) - { - switch ( api ) - { - case apiNone: return "None"; - case apiOpenMP: return "OpenMP"; - case apiTbb: return "Intel TBB"; - case apiPpl: return "MS PPL"; - default: return "unknown"; - } - } - - TaskManager() - { - m_api = apiNone; - m_numThreads = 0; -#if USE_TBB - m_tbbSchedulerInit = NULL; -#endif // #if USE_TBB - } - - Api getApi() const - { - return m_api; - } - - bool isSupported( Api api ) const - { -#if USE_OPENMP - if ( api == apiOpenMP ) - { - return true; - } -#endif -#if USE_TBB - if ( api == apiTbb ) - { - return true; - } -#endif -#if USE_PPL - if ( api == apiPpl ) - { - return true; - } -#endif - // apiNone is always "supported" - return api == apiNone; - } - - void setApi( Api api ) - { - if (isSupported(api)) - { - m_api = api; - } - else - { - // no compile time support for selected API, fallback to "none" - m_api = apiNone; - } - } - - static int getMaxNumThreads() - { -#if USE_OPENMP - return omp_get_max_threads(); -#elif USE_PPL - return concurrency::GetProcessorCount(); -#elif USE_TBB - return tbb::task_scheduler_init::default_num_threads(); -#endif - return 1; - } - - int getNumThreads() const - { - return m_numThreads; - } - - int setNumThreads( int numThreads ) - { - m_numThreads = ( std::max )( 1, numThreads ); - -#if USE_OPENMP - omp_set_num_threads( m_numThreads ); -#endif - -#if USE_PPL - { - using namespace concurrency; - if ( CurrentScheduler::Id() != -1 ) - { - CurrentScheduler::Detach(); - } - SchedulerPolicy policy; - policy.SetConcurrencyLimits( m_numThreads, m_numThreads ); - CurrentScheduler::Create( policy ); - } -#endif - -#if USE_TBB - if ( m_tbbSchedulerInit ) - { - delete m_tbbSchedulerInit; - m_tbbSchedulerInit = NULL; - } - m_tbbSchedulerInit = new tbb::task_scheduler_init( m_numThreads ); -#endif - return m_numThreads; - } - - void init() - { - if (m_numThreads == 0) - { -#if USE_PPL - setApi( apiPpl ); -#endif -#if USE_TBB - setApi( apiTbb ); -#endif -#if USE_OPENMP - setApi( apiOpenMP ); -#endif - setNumThreads(getMaxNumThreads()); - } - else - { - setNumThreads(m_numThreads); - } - } - - void shutdown() - { -#if USE_TBB - if ( m_tbbSchedulerInit ) - { - delete m_tbbSchedulerInit; - m_tbbSchedulerInit = NULL; - } -#endif - } - -private: - Api m_api; - int m_numThreads; -#if USE_TBB - tbb::task_scheduler_init* m_tbbSchedulerInit; -#endif // #if USE_TBB -}; - -extern TaskManager gTaskMgr; - - -inline static void initTaskScheduler() -{ - gTaskMgr.init(); -} - -inline static void cleanupTaskScheduler() -{ - gTaskMgr.shutdown(); -} - - -#if USE_TBB -/// -/// TbbBodyAdapter -- Converts a body object that implements the -/// "forLoop(int iBegin, int iEnd) const" function -/// into a TBB compatible object that takes a tbb::blocked_range type. -/// -template -struct TbbBodyAdapter -{ - const TBody* mBody; - - void operator()( const tbb::blocked_range& range ) const - { - mBody->forLoop( range.begin(), range.end() ); - } -}; -#endif // #if USE_TBB - -#if USE_PPL -/// -/// PplBodyAdapter -- Converts a body object that implements the -/// "forLoop(int iBegin, int iEnd) const" function -/// into a PPL compatible object that implements "void operator()( int ) const" -/// -template -struct PplBodyAdapter -{ - const TBody* mBody; - int mGrainSize; - int mIndexEnd; - - void operator()( int i ) const - { - mBody->forLoop( i, (std::min)(i + mGrainSize, mIndexEnd) ); - } -}; -#endif // #if USE_PPL - - -/// -/// parallelFor -- interface for submitting work expressed as a for loop to the worker threads -/// -template -void parallelFor( int iBegin, int iEnd, int grainSize, const TBody& body ) -{ -#if USE_OPENMP - if ( gTaskMgr.getApi() == TaskManager::apiOpenMP ) - { -#pragma omp parallel for schedule(static, 1) - for ( int i = iBegin; i < iEnd; i += grainSize ) - { - body.forLoop( i, (std::min)( i + grainSize, iEnd ) ); - } - return; - } -#endif // #if USE_OPENMP - -#if USE_PPL - if ( gTaskMgr.getApi() == TaskManager::apiPpl ) - { - // PPL dispatch - PplBodyAdapter pplBody; - pplBody.mBody = &body; - pplBody.mGrainSize = grainSize; - pplBody.mIndexEnd = iEnd; - // note: MSVC 2010 doesn't support partitioner args, so avoid them - concurrency::parallel_for( iBegin, - iEnd, - grainSize, - pplBody - ); - return; - } -#endif //#if USE_PPL - -#if USE_TBB - if ( gTaskMgr.getApi() == TaskManager::apiTbb ) - { - // TBB dispatch - TbbBodyAdapter tbbBody; - tbbBody.mBody = &body; - tbb::parallel_for( tbb::blocked_range( iBegin, iEnd, grainSize ), - tbbBody, - tbb::simple_partitioner() - ); - return; - } -#endif // #if USE_TBB - - { - // run on main thread - body.forLoop( iBegin, iEnd ); - } - -} - - - - diff --git a/src/BulletCollision/CMakeLists.txt b/src/BulletCollision/CMakeLists.txt index 67afc7e0c..85c5fc8b6 100644 --- a/src/BulletCollision/CMakeLists.txt +++ b/src/BulletCollision/CMakeLists.txt @@ -15,6 +15,7 @@ SET(BulletCollision_SRCS CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp CollisionDispatch/btBoxBoxDetector.cpp CollisionDispatch/btCollisionDispatcher.cpp + CollisionDispatch/btCollisionDispatcherMt.cpp CollisionDispatch/btCollisionObject.cpp CollisionDispatch/btCollisionWorld.cpp CollisionDispatch/btCollisionWorldImporter.cpp @@ -123,6 +124,7 @@ SET(CollisionDispatch_HDRS CollisionDispatch/btCollisionConfiguration.h CollisionDispatch/btCollisionCreateFunc.h CollisionDispatch/btCollisionDispatcher.h + CollisionDispatch/btCollisionDispatcherMt.h CollisionDispatch/btCollisionObject.h CollisionDispatch/btCollisionObjectWrapper.h CollisionDispatch/btCollisionWorld.h diff --git a/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.cpp b/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.cpp new file mode 100644 index 000000000..57cd050e5 --- /dev/null +++ b/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.cpp @@ -0,0 +1,164 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + + +#include "btCollisionDispatcherMt.h" +#include "LinearMath/btQuickprof.h" + +#include "BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h" + +#include "BulletCollision/CollisionShapes/btCollisionShape.h" +#include "BulletCollision/CollisionDispatch/btCollisionObject.h" +#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h" +#include "LinearMath/btPoolAllocator.h" +#include "BulletCollision/CollisionDispatch/btCollisionConfiguration.h" +#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h" + + +btCollisionDispatcherMt::btCollisionDispatcherMt( btCollisionConfiguration* config, int grainSize ) + : btCollisionDispatcher( config ) +{ + m_batchUpdating = false; + m_grainSize = grainSize; // iterations per task +} + + +btPersistentManifold* btCollisionDispatcherMt::getNewManifold( const btCollisionObject* body0, const btCollisionObject* body1 ) +{ + //optional relative contact breaking threshold, turned on by default (use setDispatcherFlags to switch off feature for improved performance) + + btScalar contactBreakingThreshold = ( m_dispatcherFlags & btCollisionDispatcher::CD_USE_RELATIVE_CONTACT_BREAKING_THRESHOLD ) ? + btMin( body0->getCollisionShape()->getContactBreakingThreshold( gContactBreakingThreshold ), body1->getCollisionShape()->getContactBreakingThreshold( gContactBreakingThreshold ) ) + : gContactBreakingThreshold; + + btScalar contactProcessingThreshold = btMin( body0->getContactProcessingThreshold(), body1->getContactProcessingThreshold() ); + + void* mem = m_persistentManifoldPoolAllocator->allocate( sizeof( btPersistentManifold ) ); + if ( NULL == mem ) + { + //we got a pool memory overflow, by default we fallback to dynamically allocate memory. If we require a contiguous contact pool then assert. + if ( ( m_dispatcherFlags&CD_DISABLE_CONTACTPOOL_DYNAMIC_ALLOCATION ) == 0 ) + { + mem = btAlignedAlloc( sizeof( btPersistentManifold ), 16 ); + } + else + { + btAssert( 0 ); + //make sure to increase the m_defaultMaxPersistentManifoldPoolSize in the btDefaultCollisionConstructionInfo/btDefaultCollisionConfiguration + return 0; + } + } + btPersistentManifold* manifold = new( mem ) btPersistentManifold( body0, body1, 0, contactBreakingThreshold, contactProcessingThreshold ); + if ( !m_batchUpdating ) + { + // batch updater will update manifold pointers array after finishing, so + // only need to update array when not batch-updating + btAssert( !btThreadsAreRunning() ); + manifold->m_index1a = m_manifoldsPtr.size(); + m_manifoldsPtr.push_back( manifold ); + } + + return manifold; +} + +void btCollisionDispatcherMt::releaseManifold( btPersistentManifold* manifold ) +{ + clearManifold( manifold ); + btAssert( !btThreadsAreRunning() ); + if ( !m_batchUpdating ) + { + // batch updater will update manifold pointers array after finishing, so + // only need to update array when not batch-updating + int findIndex = manifold->m_index1a; + btAssert( findIndex < m_manifoldsPtr.size() ); + m_manifoldsPtr.swap( findIndex, m_manifoldsPtr.size() - 1 ); + m_manifoldsPtr[ findIndex ]->m_index1a = findIndex; + m_manifoldsPtr.pop_back(); + } + + manifold->~btPersistentManifold(); + if ( m_persistentManifoldPoolAllocator->validPtr( manifold ) ) + { + m_persistentManifoldPoolAllocator->freeMemory( manifold ); + } + else + { + btAlignedFree( manifold ); + } +} + +struct CollisionDispatcherUpdater : public btIParallelForBody +{ + btBroadphasePair* mPairArray; + btNearCallback mCallback; + btCollisionDispatcher* mDispatcher; + const btDispatcherInfo* mInfo; + + CollisionDispatcherUpdater() + { + mPairArray = NULL; + mCallback = NULL; + mDispatcher = NULL; + mInfo = NULL; + } + void forLoop( int iBegin, int iEnd ) const + { + for ( int i = iBegin; i < iEnd; ++i ) + { + btBroadphasePair* pair = &mPairArray[ i ]; + mCallback( *pair, *mDispatcher, *mInfo ); + } + } +}; + + +void btCollisionDispatcherMt::dispatchAllCollisionPairs( btOverlappingPairCache* pairCache, const btDispatcherInfo& info, btDispatcher* dispatcher ) +{ + int pairCount = pairCache->getNumOverlappingPairs(); + if ( pairCount == 0 ) + { + return; + } + CollisionDispatcherUpdater updater; + updater.mCallback = getNearCallback(); + updater.mPairArray = pairCache->getOverlappingPairArrayPtr(); + updater.mDispatcher = this; + updater.mInfo = &info; + + m_batchUpdating = true; + btParallelFor( 0, pairCount, m_grainSize, updater ); + m_batchUpdating = false; + + // reconstruct the manifolds array to ensure determinism + m_manifoldsPtr.resizeNoInitialize( 0 ); + + btBroadphasePair* pairs = pairCache->getOverlappingPairArrayPtr(); + for ( int i = 0; i < pairCount; ++i ) + { + if (btCollisionAlgorithm* algo = pairs[ i ].m_algorithm) + { + algo->getAllContactManifolds( m_manifoldsPtr ); + } + } + + // update the indices (used when releasing manifolds) + for ( int i = 0; i < m_manifoldsPtr.size(); ++i ) + { + m_manifoldsPtr[ i ]->m_index1a = i; + } +} + + diff --git a/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h b/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h new file mode 100644 index 000000000..f1d7eafdc --- /dev/null +++ b/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h @@ -0,0 +1,39 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef BT_COLLISION_DISPATCHER_MT_H +#define BT_COLLISION_DISPATCHER_MT_H + +#include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h" +#include "LinearMath/btThreads.h" + + +class btCollisionDispatcherMt : public btCollisionDispatcher +{ +public: + btCollisionDispatcherMt( btCollisionConfiguration* config, int grainSize = 40 ); + + virtual btPersistentManifold* getNewManifold( const btCollisionObject* body0, const btCollisionObject* body1 ) BT_OVERRIDE; + virtual void releaseManifold( btPersistentManifold* manifold ) BT_OVERRIDE; + + virtual void dispatchAllCollisionPairs( btOverlappingPairCache* pairCache, const btDispatcherInfo& info, btDispatcher* dispatcher ) BT_OVERRIDE; + +protected: + bool m_batchUpdating; + int m_grainSize; +}; + +#endif //BT_COLLISION_DISPATCHER_MT_H + diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp index 5e51a994c..faa8193dd 100644 --- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp +++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp @@ -108,8 +108,105 @@ struct InplaceSolverIslandCallbackMt : public btSimulationIslandManagerMt::Islan }; +/// +/// btConstraintSolverPoolMt +/// -btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,btBroadphaseInterface* pairCache,btConstraintSolver* constraintSolver, btCollisionConfiguration* collisionConfiguration) +btConstraintSolverPoolMt::ThreadSolver* btConstraintSolverPoolMt::getAndLockThreadSolver() +{ + int i = btGetCurrentThreadIndex() % m_solvers.size(); + while ( true ) + { + ThreadSolver& solver = m_solvers[ i ]; + if ( solver.mutex.tryLock() ) + { + return &solver; + } + // failed, try the next one + i = ( i + 1 ) % m_solvers.size(); + } + return NULL; +} + +void btConstraintSolverPoolMt::init( btConstraintSolver** solvers, int numSolvers ) +{ + m_solverType = BT_SEQUENTIAL_IMPULSE_SOLVER; + m_solvers.resize( numSolvers ); + for ( int i = 0; i < numSolvers; ++i ) + { + m_solvers[ i ].solver = solvers[ i ]; + } + if ( numSolvers > 0 ) + { + m_solverType = solvers[ 0 ]->getSolverType(); + } +} + +// create the solvers for me +btConstraintSolverPoolMt::btConstraintSolverPoolMt( int numSolvers ) +{ + btAlignedObjectArray solvers; + solvers.reserve( numSolvers ); + for ( int i = 0; i < numSolvers; ++i ) + { + btConstraintSolver* solver = new btSequentialImpulseConstraintSolver(); + solvers.push_back( solver ); + } + init( &solvers[ 0 ], numSolvers ); +} + +// pass in fully constructed solvers (destructor will delete them) +btConstraintSolverPoolMt::btConstraintSolverPoolMt( btConstraintSolver** solvers, int numSolvers ) +{ + init( solvers, numSolvers ); +} + +btConstraintSolverPoolMt::~btConstraintSolverPoolMt() +{ + // delete all solvers + for ( int i = 0; i < m_solvers.size(); ++i ) + { + ThreadSolver& solver = m_solvers[ i ]; + delete solver.solver; + solver.solver = NULL; + } +} + +///solve a group of constraints +btScalar btConstraintSolverPoolMt::solveGroup( btCollisionObject** bodies, + int numBodies, + btPersistentManifold** manifolds, + int numManifolds, + btTypedConstraint** constraints, + int numConstraints, + const btContactSolverInfo& info, + btIDebugDraw* debugDrawer, + btDispatcher* dispatcher +) +{ + ThreadSolver* ts = getAndLockThreadSolver(); + ts->solver->solveGroup( bodies, numBodies, manifolds, numManifolds, constraints, numConstraints, info, debugDrawer, dispatcher ); + ts->mutex.unlock(); + return 0.0f; +} + +void btConstraintSolverPoolMt::reset() +{ + for ( int i = 0; i < m_solvers.size(); ++i ) + { + ThreadSolver& solver = m_solvers[ i ]; + solver.mutex.lock(); + solver.solver->reset(); + solver.mutex.unlock(); + } +} + + +/// +/// btDiscreteDynamicsWorldMt +/// + +btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher, btBroadphaseInterface* pairCache, btConstraintSolverPoolMt* constraintSolver, btCollisionConfiguration* collisionConfiguration) : btDiscreteDynamicsWorld(dispatcher,pairCache,constraintSolver,collisionConfiguration) { if (m_ownsIslandManager) @@ -124,8 +221,8 @@ btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,bt { void* mem = btAlignedAlloc(sizeof(btSimulationIslandManagerMt),16); btSimulationIslandManagerMt* im = new (mem) btSimulationIslandManagerMt(); - m_islandManager = im; im->setMinimumSolverBatchSize( m_solverInfo.m_minimumSolverBatchSize ); + m_islandManager = im; } } @@ -145,7 +242,7 @@ btDiscreteDynamicsWorldMt::~btDiscreteDynamicsWorldMt() } -void btDiscreteDynamicsWorldMt::solveConstraints(btContactSolverInfo& solverInfo) +void btDiscreteDynamicsWorldMt::solveConstraints(btContactSolverInfo& solverInfo) { BT_PROFILE("solveConstraints"); @@ -160,3 +257,65 @@ void btDiscreteDynamicsWorldMt::solveConstraints(btContactSolverInfo& solverInfo } +struct UpdaterUnconstrainedMotion : public btIParallelForBody +{ + btScalar timeStep; + btRigidBody** rigidBodies; + + void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + for ( int i = iBegin; i < iEnd; ++i ) + { + btRigidBody* body = rigidBodies[ i ]; + if ( !body->isStaticOrKinematicObject() ) + { + //don't integrate/update velocities here, it happens in the constraint solver + body->applyDamping( timeStep ); + body->predictIntegratedTransform( timeStep, body->getInterpolationWorldTransform() ); + } + } + } +}; + + +void btDiscreteDynamicsWorldMt::predictUnconstraintMotion( btScalar timeStep ) +{ + BT_PROFILE( "predictUnconstraintMotion" ); + int grainSize = 50; // num of iterations per task for TBB + int bodyCount = m_nonStaticRigidBodies.size(); + UpdaterUnconstrainedMotion update; + update.timeStep = timeStep; + update.rigidBodies = bodyCount ? &m_nonStaticRigidBodies[ 0 ] : NULL; + btParallelFor( 0, bodyCount, grainSize, update ); +} + + +void btDiscreteDynamicsWorldMt::createPredictiveContacts( btScalar timeStep ) +{ + releasePredictiveContacts(); + int grainSize = 50; // num of iterations per task for TBB or OPENMP + if ( int bodyCount = m_nonStaticRigidBodies.size() ) + { + UpdaterCreatePredictiveContacts update; + update.world = this; + update.timeStep = timeStep; + update.rigidBodies = &m_nonStaticRigidBodies[ 0 ]; + btParallelFor( 0, bodyCount, grainSize, update ); + } +} + + +void btDiscreteDynamicsWorldMt::integrateTransforms( btScalar timeStep ) +{ + BT_PROFILE( "integrateTransforms" ); + int grainSize = 50; // num of iterations per task for TBB or OPENMP + if ( int bodyCount = m_nonStaticRigidBodies.size() ) + { + UpdaterIntegrateTransforms update; + update.world = this; + update.timeStep = timeStep; + update.rigidBodies = &m_nonStaticRigidBodies[ 0 ]; + btParallelFor( 0, bodyCount, grainSize, update ); + } +} + diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h index b28371b51..2f144cdda 100644 --- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h +++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h @@ -18,24 +18,116 @@ subject to the following restrictions: #define BT_DISCRETE_DYNAMICS_WORLD_MT_H #include "btDiscreteDynamicsWorld.h" +#include "btSimulationIslandManagerMt.h" +#include "BulletDynamics/ConstraintSolver/btConstraintSolver.h" struct InplaceSolverIslandCallbackMt; +/// +/// btConstraintSolverPoolMt - masquerades as a constraint solver, but really it is a threadsafe pool of them. +/// +/// Each solver in the pool is protected by a mutex. When solveGroup is called from a thread, +/// the pool looks for a solver that isn't being used by another thread, locks it, and dispatches the +/// call to the solver. +/// So long as there are at least as many solvers as there are hardware threads, it should never need to +/// spin wait. +/// +class btConstraintSolverPoolMt : public btConstraintSolver +{ +public: + // create the solvers for me + explicit btConstraintSolverPoolMt( int numSolvers ); + + // pass in fully constructed solvers (destructor will delete them) + btConstraintSolverPoolMt( btConstraintSolver** solvers, int numSolvers ); + + virtual ~btConstraintSolverPoolMt(); + + ///solve a group of constraints + virtual btScalar solveGroup( btCollisionObject** bodies, + int numBodies, + btPersistentManifold** manifolds, + int numManifolds, + btTypedConstraint** constraints, + int numConstraints, + const btContactSolverInfo& info, + btIDebugDraw* debugDrawer, + btDispatcher* dispatcher + ) BT_OVERRIDE; + + virtual void reset() BT_OVERRIDE; + virtual btConstraintSolverType getSolverType() const BT_OVERRIDE { return m_solverType; } + +private: + const static size_t kCacheLineSize = 128; + struct ThreadSolver + { + btConstraintSolver* solver; + btSpinMutex mutex; + char _cachelinePadding[ kCacheLineSize - sizeof( btSpinMutex ) - sizeof( void* ) ]; // keep mutexes from sharing a cache line + }; + btAlignedObjectArray m_solvers; + btConstraintSolverType m_solverType; + + ThreadSolver* getAndLockThreadSolver(); + void init( btConstraintSolver** solvers, int numSolvers ); +}; + + + /// /// btDiscreteDynamicsWorldMt -- a version of DiscreteDynamicsWorld with some minor changes to support /// solving simulation islands on multiple threads. /// +/// Should function exactly like btDiscreteDynamicsWorld. +/// Also 3 methods that iterate over all of the rigidbodies can run in parallel: +/// - predictUnconstraintMotion +/// - integrateTransforms +/// - createPredictiveContacts +/// ATTRIBUTE_ALIGNED16(class) btDiscreteDynamicsWorldMt : public btDiscreteDynamicsWorld { protected: InplaceSolverIslandCallbackMt* m_solverIslandCallbackMt; - virtual void solveConstraints(btContactSolverInfo& solverInfo); + virtual void solveConstraints(btContactSolverInfo& solverInfo) BT_OVERRIDE; + + virtual void predictUnconstraintMotion( btScalar timeStep ) BT_OVERRIDE; + + struct UpdaterCreatePredictiveContacts : public btIParallelForBody + { + btScalar timeStep; + btRigidBody** rigidBodies; + btDiscreteDynamicsWorldMt* world; + + void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + world->createPredictiveContactsInternal( &rigidBodies[ iBegin ], iEnd - iBegin, timeStep ); + } + }; + virtual void createPredictiveContacts( btScalar timeStep ) BT_OVERRIDE; + + struct UpdaterIntegrateTransforms : public btIParallelForBody + { + btScalar timeStep; + btRigidBody** rigidBodies; + btDiscreteDynamicsWorldMt* world; + + void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + world->integrateTransformsInternal( &rigidBodies[ iBegin ], iEnd - iBegin, timeStep ); + } + }; + virtual void integrateTransforms( btScalar timeStep ) BT_OVERRIDE; public: BT_DECLARE_ALIGNED_ALLOCATOR(); - btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,btBroadphaseInterface* pairCache,btConstraintSolver* constraintSolver,btCollisionConfiguration* collisionConfiguration); + btDiscreteDynamicsWorldMt(btDispatcher* dispatcher, + btBroadphaseInterface* pairCache, + btConstraintSolverPoolMt* constraintSolver, // Note this should be a solver-pool for multi-threading + btCollisionConfiguration* collisionConfiguration + ); virtual ~btDiscreteDynamicsWorldMt(); }; diff --git a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp index ad63b6ee0..6373f60ff 100644 --- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp +++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp @@ -15,6 +15,7 @@ subject to the following restrictions: #include "LinearMath/btScalar.h" +#include "LinearMath/btThreads.h" #include "btSimulationIslandManagerMt.h" #include "BulletCollision/BroadphaseCollision/btDispatcher.h" #include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h" @@ -44,7 +45,7 @@ btSimulationIslandManagerMt::btSimulationIslandManagerMt() { m_minimumSolverBatchSize = calcBatchCost(0, 128, 0); m_batchIslandMinBodyCount = 32; - m_islandDispatch = defaultIslandDispatch; + m_islandDispatch = parallelIslandDispatch; m_batchIsland = NULL; } @@ -545,7 +546,7 @@ void btSimulationIslandManagerMt::mergeIslands() } -void btSimulationIslandManagerMt::defaultIslandDispatch( btAlignedObjectArray* islandsPtr, IslandCallback* callback ) +void btSimulationIslandManagerMt::serialIslandDispatch( btAlignedObjectArray* islandsPtr, IslandCallback* callback ) { // serial dispatch btAlignedObjectArray& islands = *islandsPtr; @@ -565,6 +566,40 @@ void btSimulationIslandManagerMt::defaultIslandDispatch( btAlignedObjectArray* islandsPtr; + btSimulationIslandManagerMt::IslandCallback* callback; + + void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + for ( int i = iBegin; i < iEnd; ++i ) + { + btSimulationIslandManagerMt::Island* island = ( *islandsPtr )[ i ]; + btPersistentManifold** manifolds = island->manifoldArray.size() ? &island->manifoldArray[ 0 ] : NULL; + btTypedConstraint** constraintsPtr = island->constraintArray.size() ? &island->constraintArray[ 0 ] : NULL; + callback->processIsland( &island->bodyArray[ 0 ], + island->bodyArray.size(), + manifolds, + island->manifoldArray.size(), + constraintsPtr, + island->constraintArray.size(), + island->id + ); + } + } +}; + +void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray* islandsPtr, IslandCallback* callback ) +{ + int grainSize = 1; // iterations per task + UpdateIslandDispatcher dispatcher; + dispatcher.islandsPtr = islandsPtr; + dispatcher.callback = callback; + btParallelFor( 0, islandsPtr->size(), grainSize, dispatcher ); +} + + ///@todo: this is random access, it can be walked 'cache friendly'! void btSimulationIslandManagerMt::buildAndProcessIslands( btDispatcher* dispatcher, btCollisionWorld* collisionWorld, diff --git a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h index 117061623..e3e0ea523 100644 --- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h +++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h @@ -59,7 +59,8 @@ public: ) = 0; }; typedef void( *IslandDispatchFunc ) ( btAlignedObjectArray* islands, IslandCallback* callback ); - static void defaultIslandDispatch( btAlignedObjectArray* islands, IslandCallback* callback ); + static void serialIslandDispatch( btAlignedObjectArray* islands, IslandCallback* callback ); + static void parallelIslandDispatch( btAlignedObjectArray* islandsPtr, IslandCallback* callback ); protected: btAlignedObjectArray m_allocatedIslands; // owner of all Islands btAlignedObjectArray m_activeIslands; // islands actively in use diff --git a/src/LinearMath/btThreads.cpp b/src/LinearMath/btThreads.cpp index b72301a2e..3c73255a1 100644 --- a/src/LinearMath/btThreads.cpp +++ b/src/LinearMath/btThreads.cpp @@ -14,6 +14,247 @@ subject to the following restrictions: #include "btThreads.h" +#include // for min and max + +#if BT_THREADSAFE + +#if BT_USE_OPENMP + +#include + +#endif // #if BT_USE_OPENMP + + +#if BT_USE_PPL + +// use Microsoft Parallel Patterns Library (installed with Visual Studio 2010 and later) +#include // if you get a compile error here, check whether your version of Visual Studio includes PPL +// Visual Studio 2010 and later should come with it +#include // for GetProcessorCount() + +#endif // #if BT_USE_PPL + + +#if BT_USE_TBB + +// use Intel Threading Building Blocks for thread management +#define __TBB_NO_IMPLICIT_LINKAGE 1 +#include +#include +#include +#include + +#endif // #if BT_USE_TBB + + +static btITaskScheduler* gBtTaskScheduler; +static int gThreadsRunningCounter = 0; // useful for detecting if we are trying to do nested parallel-for calls +static btSpinMutex gThreadsRunningCounterMutex; + +void btPushThreadsAreRunning() +{ + gThreadsRunningCounterMutex.lock(); + gThreadsRunningCounter++; + gThreadsRunningCounterMutex.unlock(); +} + +void btPopThreadsAreRunning() +{ + gThreadsRunningCounterMutex.lock(); + gThreadsRunningCounter--; + gThreadsRunningCounterMutex.unlock(); +} + +bool btThreadsAreRunning() +{ + return gThreadsRunningCounter != 0; +} + + +void btSetTaskScheduler( btITaskScheduler* ts ) +{ + gBtTaskScheduler = ts; +} + +btITaskScheduler* btGetTaskScheduler() +{ + return gBtTaskScheduler; +} + +void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) +{ + gBtTaskScheduler->parallelFor( iBegin, iEnd, grainSize, body ); +} + + +#if BT_USE_OPENMP +/// +/// btTaskSchedulerOpenMP -- OpenMP task scheduler implementation +/// +class btTaskSchedulerOpenMP : public btITaskScheduler +{ + int m_numThreads; +public: + btTaskSchedulerOpenMP() : btITaskScheduler( "OpenMP" ) + { + m_numThreads = 0; + } + virtual int getMaxNumThreads() const BT_OVERRIDE + { + return omp_get_max_threads(); + } + virtual int getNumThreads() const BT_OVERRIDE + { + return m_numThreads; + } + virtual void setNumThreads( int numThreads ) BT_OVERRIDE + { + m_numThreads = ( std::max )( 1, numThreads ); + omp_set_num_threads( m_numThreads ); + } + virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE + { + btPushThreadsAreRunning(); +#pragma omp parallel for schedule( static, 1 ) + for ( int i = iBegin; i < iEnd; i += grainSize ) + { + body.forLoop( i, ( std::min )( i + grainSize, iEnd ) ); + } + btPopThreadsAreRunning(); + } +}; +#endif // #if BT_USE_OPENMP + + +#if BT_USE_TBB +/// +/// btTaskSchedulerTBB -- task scheduler implemented via Intel Threaded Building Blocks +/// +class btTaskSchedulerTBB : public btITaskScheduler +{ + int m_numThreads; + tbb::task_scheduler_init* m_tbbSchedulerInit; + +public: + btTaskSchedulerTBB() : btITaskScheduler( "IntelTBB" ) + { + m_numThreads = 0; + m_tbbSchedulerInit = NULL; + } + ~btTaskSchedulerTBB() + { + if ( m_tbbSchedulerInit ) + { + delete m_tbbSchedulerInit; + m_tbbSchedulerInit = NULL; + } + } + + virtual int getMaxNumThreads() const BT_OVERRIDE + { + return tbb::task_scheduler_init::default_num_threads(); + } + virtual int getNumThreads() const BT_OVERRIDE + { + return m_numThreads; + } + virtual void setNumThreads( int numThreads ) BT_OVERRIDE + { + m_numThreads = ( std::max )( 1, numThreads ); + if ( m_tbbSchedulerInit ) + { + delete m_tbbSchedulerInit; + m_tbbSchedulerInit = NULL; + } + m_tbbSchedulerInit = new tbb::task_scheduler_init( m_numThreads ); + } + struct BodyAdapter + { + const btIParallelForBody* mBody; + + void operator()( const tbb::blocked_range& range ) const + { + mBody->forLoop( range.begin(), range.end() ); + } + }; + virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE + { + // TBB dispatch + BodyAdapter tbbBody; + tbbBody.mBody = &body; + btPushThreadsAreRunning(); + tbb::parallel_for( tbb::blocked_range( iBegin, iEnd, grainSize ), + tbbBody, + tbb::simple_partitioner() + ); + btPopThreadsAreRunning(); + } +}; +#endif // #if BT_USE_TBB + +#if BT_USE_PPL +/// +/// btTaskSchedulerPPL -- task scheduler implemented via Microsoft Parallel Patterns Lib +/// +class btTaskSchedulerPPL : public btITaskScheduler +{ + int m_numThreads; +public: + btTaskSchedulerPPL() : btITaskScheduler( "PPL" ) + { + m_numThreads = 0; + } + virtual int getMaxNumThreads() const BT_OVERRIDE + { + return concurrency::GetProcessorCount(); + } + virtual int getNumThreads() const BT_OVERRIDE + { + return m_numThreads; + } + virtual void setNumThreads( int numThreads ) BT_OVERRIDE + { + m_numThreads = ( std::max )( 1, numThreads ); + using namespace concurrency; + if ( CurrentScheduler::Id() != -1 ) + { + CurrentScheduler::Detach(); + } + SchedulerPolicy policy; + policy.SetConcurrencyLimits( m_numThreads, m_numThreads ); + CurrentScheduler::Create( policy ); + } + struct BodyAdapter + { + const btIParallelForBody* mBody; + int mGrainSize; + int mIndexEnd; + + void operator()( int i ) const + { + mBody->forLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) ); + } + }; + virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE + { + // PPL dispatch + BodyAdapter pplBody; + pplBody.mBody = &body; + pplBody.mGrainSize = grainSize; + pplBody.mIndexEnd = iEnd; + btPushThreadsAreRunning(); + // note: MSVC 2010 doesn't support partitioner args, so avoid them + concurrency::parallel_for( iBegin, + iEnd, + grainSize, + pplBody + ); + btPopThreadsAreRunning(); + } +}; +#endif // #if BT_USE_PPL + + // // Lightweight spin-mutex based on atomics @@ -22,8 +263,6 @@ subject to the following restrictions: // context switching. // -#if BT_THREADSAFE - #if __cplusplus >= 201103L // for anything claiming full C++11 compliance, use C++11 atomics @@ -229,3 +468,64 @@ bool btSpinMutex::tryLock() #endif // #if BT_THREADSAFE + +/// +/// btTaskSchedulerSequential -- non-threaded implementation of task scheduler +/// (fallback in case no multi-threaded schedulers are available) +/// +class btTaskSchedulerSequential : public btITaskScheduler +{ +public: + btTaskSchedulerSequential() : btITaskScheduler( "Sequential" ) {} + virtual int getMaxNumThreads() const BT_OVERRIDE { return 1; } + virtual int getNumThreads() const BT_OVERRIDE { return 1; } + virtual void setNumThreads( int numThreads ) BT_OVERRIDE {} + virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE + { + body.forLoop( iBegin, iEnd ); + } +}; + +// create a non-threaded task scheduler (always available) +btITaskScheduler* btGetSequentialTaskScheduler() +{ + static btTaskSchedulerSequential sTaskScheduler; + return &sTaskScheduler; +} + + +// create an OpenMP task scheduler (if available, otherwise returns null) +btITaskScheduler* btGetOpenMPTaskScheduler() +{ +#if BT_USE_OPENMP && BT_THREADSAFE + static btTaskSchedulerOpenMP sTaskScheduler; + return &sTaskScheduler; +#else + return NULL; +#endif +} + + +// create an Intel TBB task scheduler (if available, otherwise returns null) +btITaskScheduler* btGetTBBTaskScheduler() +{ +#if BT_USE_TBB && BT_THREADSAFE + static btTaskSchedulerTBB sTaskScheduler; + return &sTaskScheduler; +#else + return NULL; +#endif +} + + +// create a PPL task scheduler (if available, otherwise returns null) +btITaskScheduler* btGetPPLTaskScheduler() +{ +#if BT_USE_PPL && BT_THREADSAFE + static btTaskSchedulerPPL sTaskScheduler; + return &sTaskScheduler; +#else + return NULL; +#endif +} + diff --git a/src/LinearMath/btThreads.h b/src/LinearMath/btThreads.h index a15e5250b..b592ec3e5 100644 --- a/src/LinearMath/btThreads.h +++ b/src/LinearMath/btThreads.h @@ -19,6 +19,15 @@ subject to the following restrictions: #include "btScalar.h" // has definitions like SIMD_FORCE_INLINE +#if defined (_MSC_VER) && _MSC_VER >= 1600 +// give us a compile error if any signatures of overriden methods is changed +#define BT_OVERRIDE override +#endif + +#ifndef BT_OVERRIDE +#define BT_OVERRIDE +#endif + /// /// btSpinMutex -- lightweight spin-mutex implemented with atomic ops, never puts /// a thread to sleep because it is designed to be used with a task scheduler @@ -59,6 +68,7 @@ SIMD_FORCE_INLINE bool btMutexTryLock( btSpinMutex* mutex ) // for internal use only bool btIsMainThread(); +bool btThreadsAreRunning(); unsigned int btGetCurrentThreadIndex(); const unsigned int BT_MAX_THREAD_COUNT = 64; @@ -71,5 +81,55 @@ SIMD_FORCE_INLINE void btMutexUnlock( btSpinMutex* ) {} SIMD_FORCE_INLINE bool btMutexTryLock( btSpinMutex* ) {return true;} #endif +// +// btIParallelForBody -- subclass this to express work that can be done in parallel +// +class btIParallelForBody +{ +public: + virtual void forLoop( int iBegin, int iEnd ) const = 0; +}; + +// +// btITaskScheduler -- subclass this to implement a task scheduler that can dispatch work to +// worker threads +// +class btITaskScheduler +{ + const char* m_name; +public: + btITaskScheduler( const char* name ) : m_name( name ) {} + const char* getName() const { return m_name; } + + virtual ~btITaskScheduler() {} + virtual int getMaxNumThreads() const = 0; + virtual int getNumThreads() const = 0; + virtual void setNumThreads( int numThreads ) = 0; + virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) = 0; +}; + +// set the task scheduler to use for all calls to btParallelFor() +// NOTE: you must set this prior to using any of the multi-threaded "Mt" classes +void btSetTaskScheduler( btITaskScheduler* ts ); + +// get the current task scheduler +btITaskScheduler* btGetTaskScheduler(); + +// get non-threaded task scheduler (always available) +btITaskScheduler* btGetSequentialTaskScheduler(); + +// get OpenMP task scheduler (if available, otherwise returns null) +btITaskScheduler* btGetOpenMPTaskScheduler(); + +// get Intel TBB task scheduler (if available, otherwise returns null) +btITaskScheduler* btGetTBBTaskScheduler(); + +// get PPL task scheduler (if available, otherwise returns null) +btITaskScheduler* btGetPPLTaskScheduler(); + +// btParallelFor -- call this to dispatch work like a for-loop +// (iterations may be done out of order, so no dependencies are allowed) +void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ); + #endif //BT_THREADS_H From d77c3d5b6812555467835d65a1441f2698d42495 Mon Sep 17 00:00:00 2001 From: Lunkhound Date: Tue, 23 May 2017 02:34:29 -0700 Subject: [PATCH 2/7] fix compile errors in non-threadsafe build --- examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp | 12 ++++++++++++ .../Dynamics/btDiscreteDynamicsWorldMt.cpp | 5 ++++- src/LinearMath/btThreads.cpp | 6 ++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp index 82797e35e..a47b76da6 100644 --- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp +++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp @@ -250,6 +250,7 @@ public: void init() { addTaskScheduler( btGetSequentialTaskScheduler() ); +#if BT_THREADSAFE addTaskScheduler( btGetOpenMPTaskScheduler() ); addTaskScheduler( btGetTBBTaskScheduler() ); addTaskScheduler( btGetPPLTaskScheduler() ); @@ -263,6 +264,7 @@ public: btSetTaskScheduler( m_taskSchedulers[ 0 ] ); } btGetTaskScheduler()->setNumThreads( btGetTaskScheduler()->getMaxNumThreads() ); +#endif // #if BT_THREADSAFE } void addTaskScheduler( btITaskScheduler* ts ) @@ -353,6 +355,7 @@ static void setSolverTypeCallback(int buttonId, bool buttonState, void* userPoin static void setNumThreads( int numThreads ) { +#if BT_THREADSAFE int newNumThreads = ( std::min )( numThreads, int( BT_MAX_THREAD_COUNT ) ); int oldNumThreads = btGetTaskScheduler()->getNumThreads(); // only call when the thread count is different @@ -360,13 +363,16 @@ static void setNumThreads( int numThreads ) { btGetTaskScheduler()->setNumThreads( newNumThreads ); } +#endif // #if BT_THREADSAFE } static void apiSelectButtonCallback(int buttonId, bool buttonState, void* userPointer) { +#if BT_THREADSAFE // change the task scheduler btSetTaskScheduler( gTaskSchedulerMgr.getTaskScheduler( buttonId ) ); setNumThreads( int( gSliderNumThreads ) ); +#endif // #if BT_THREADSAFE } static void setThreadCountCallback(float val, void* userPtr) @@ -392,6 +398,7 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld() #endif if ( gMultithreadedWorld ) { +#if BT_THREADSAFE m_dispatcher = NULL; btDefaultCollisionConstructionInfo cci; cci.m_defaultMaxPersistentManifoldPoolSize = 80000; @@ -416,6 +423,7 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld() m_dynamicsWorld = world; m_multithreadedWorld = true; btAssert( btGetTaskScheduler() != NULL ); +#endif // #if BT_THREADSAFE } else { @@ -534,6 +542,7 @@ void CommonRigidBodyMTBase::createDefaultParameters() } if (m_multithreadedWorld) { +#if BT_THREADSAFE // create a button for each supported threading API for ( int iApi = 0; iApi < gTaskSchedulerMgr.getNumTaskSchedulers(); ++iApi ) { @@ -558,6 +567,7 @@ void CommonRigidBodyMTBase::createDefaultParameters() slider.m_clampToIntegers = true; m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider ); } +#endif // #if BT_THREADSAFE } } @@ -589,6 +599,7 @@ void CommonRigidBodyMTBase::drawScreenText() { if ( m_multithreadedWorld ) { +#if BT_THREADSAFE int numManifolds = m_dispatcher->getNumManifolds(); int numContacts = 0; for ( int i = 0; i < numManifolds; ++i ) @@ -607,6 +618,7 @@ void CommonRigidBodyMTBase::drawScreenText() ); m_guiHelper->getAppInterface()->drawText( msg, 100, yCoord, 0.4f ); yCoord += yStep; +#endif // #if BT_THREADSAFE } { int sm = gSolverMode; diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp index faa8193dd..df6c17c5b 100644 --- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp +++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp @@ -114,7 +114,10 @@ struct InplaceSolverIslandCallbackMt : public btSimulationIslandManagerMt::Islan btConstraintSolverPoolMt::ThreadSolver* btConstraintSolverPoolMt::getAndLockThreadSolver() { - int i = btGetCurrentThreadIndex() % m_solvers.size(); + int i = 0; +#if BT_THREADSAFE + i = btGetCurrentThreadIndex() % m_solvers.size(); +#endif // #if BT_THREADSAFE while ( true ) { ThreadSolver& solver = m_solvers[ i ]; diff --git a/src/LinearMath/btThreads.cpp b/src/LinearMath/btThreads.cpp index 3c73255a1..85de7c4d2 100644 --- a/src/LinearMath/btThreads.cpp +++ b/src/LinearMath/btThreads.cpp @@ -465,6 +465,12 @@ bool btSpinMutex::tryLock() return true; } +// non-parallel version of btParallelFor +void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) +{ + btAssert(!"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE"); + body.forLoop( iBegin, iEnd ); +} #endif // #if BT_THREADSAFE From 34e2c1b7842cba6f460bf761d4d50c148f345397 Mon Sep 17 00:00:00 2001 From: Lunkhound Date: Mon, 29 May 2017 23:53:35 -0700 Subject: [PATCH 3/7] add profiling info --- .../Dynamics/btDiscreteDynamicsWorldMt.cpp | 27 ++++++++++--------- .../Dynamics/btSimulationIslandManagerMt.cpp | 2 ++ src/LinearMath/btThreads.cpp | 8 ++++++ 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp index df6c17c5b..1d10bad92 100644 --- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp +++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp @@ -284,26 +284,29 @@ struct UpdaterUnconstrainedMotion : public btIParallelForBody void btDiscreteDynamicsWorldMt::predictUnconstraintMotion( btScalar timeStep ) { BT_PROFILE( "predictUnconstraintMotion" ); - int grainSize = 50; // num of iterations per task for TBB - int bodyCount = m_nonStaticRigidBodies.size(); - UpdaterUnconstrainedMotion update; - update.timeStep = timeStep; - update.rigidBodies = bodyCount ? &m_nonStaticRigidBodies[ 0 ] : NULL; - btParallelFor( 0, bodyCount, grainSize, update ); + if ( m_nonStaticRigidBodies.size() > 0 ) + { + UpdaterUnconstrainedMotion update; + update.timeStep = timeStep; + update.rigidBodies = &m_nonStaticRigidBodies[ 0 ]; + int grainSize = 50; // num of iterations per task for task scheduler + btParallelFor( 0, m_nonStaticRigidBodies.size(), grainSize, update ); + } } void btDiscreteDynamicsWorldMt::createPredictiveContacts( btScalar timeStep ) { + BT_PROFILE( "createPredictiveContacts" ); releasePredictiveContacts(); - int grainSize = 50; // num of iterations per task for TBB or OPENMP - if ( int bodyCount = m_nonStaticRigidBodies.size() ) + if ( m_nonStaticRigidBodies.size() > 0 ) { UpdaterCreatePredictiveContacts update; update.world = this; update.timeStep = timeStep; update.rigidBodies = &m_nonStaticRigidBodies[ 0 ]; - btParallelFor( 0, bodyCount, grainSize, update ); + int grainSize = 50; // num of iterations per task for task scheduler + btParallelFor( 0, m_nonStaticRigidBodies.size(), grainSize, update ); } } @@ -311,14 +314,14 @@ void btDiscreteDynamicsWorldMt::createPredictiveContacts( btScalar timeStep ) void btDiscreteDynamicsWorldMt::integrateTransforms( btScalar timeStep ) { BT_PROFILE( "integrateTransforms" ); - int grainSize = 50; // num of iterations per task for TBB or OPENMP - if ( int bodyCount = m_nonStaticRigidBodies.size() ) + if ( m_nonStaticRigidBodies.size() > 0 ) { UpdaterIntegrateTransforms update; update.world = this; update.timeStep = timeStep; update.rigidBodies = &m_nonStaticRigidBodies[ 0 ]; - btParallelFor( 0, bodyCount, grainSize, update ); + int grainSize = 50; // num of iterations per task for task scheduler + btParallelFor( 0, m_nonStaticRigidBodies.size(), grainSize, update ); } } diff --git a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp index 6373f60ff..99b34353c 100644 --- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp +++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp @@ -548,6 +548,7 @@ void btSimulationIslandManagerMt::mergeIslands() void btSimulationIslandManagerMt::serialIslandDispatch( btAlignedObjectArray* islandsPtr, IslandCallback* callback ) { + BT_PROFILE( "serialIslandDispatch" ); // serial dispatch btAlignedObjectArray& islands = *islandsPtr; for ( int i = 0; i < islands.size(); ++i ) @@ -592,6 +593,7 @@ struct UpdateIslandDispatcher : public btIParallelForBody void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray* islandsPtr, IslandCallback* callback ) { + BT_PROFILE( "parallelIslandDispatch" ); int grainSize = 1; // iterations per task UpdateIslandDispatcher dispatcher; dispatcher.islandsPtr = islandsPtr; diff --git a/src/LinearMath/btThreads.cpp b/src/LinearMath/btThreads.cpp index 85de7c4d2..876bc4bb1 100644 --- a/src/LinearMath/btThreads.cpp +++ b/src/LinearMath/btThreads.cpp @@ -14,6 +14,7 @@ subject to the following restrictions: #include "btThreads.h" +#include "btQuickprof.h" #include // for min and max #if BT_THREADSAFE @@ -114,10 +115,12 @@ public: } virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE { + BT_PROFILE( "parallelFor_OpenMP" ); btPushThreadsAreRunning(); #pragma omp parallel for schedule( static, 1 ) for ( int i = iBegin; i < iEnd; i += grainSize ) { + BT_PROFILE( "OpenMP_job" ); body.forLoop( i, ( std::min )( i + grainSize, iEnd ) ); } btPopThreadsAreRunning(); @@ -174,11 +177,13 @@ public: void operator()( const tbb::blocked_range& range ) const { + BT_PROFILE( "TBB_job" ); mBody->forLoop( range.begin(), range.end() ); } }; virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE { + BT_PROFILE( "parallelFor_TBB" ); // TBB dispatch BodyAdapter tbbBody; tbbBody.mBody = &body; @@ -232,11 +237,13 @@ public: void operator()( int i ) const { + BT_PROFILE( "PPL_job" ); mBody->forLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) ); } }; virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE { + BT_PROFILE( "parallelFor_PPL" ); // PPL dispatch BodyAdapter pplBody; pplBody.mBody = &body; @@ -488,6 +495,7 @@ public: virtual void setNumThreads( int numThreads ) BT_OVERRIDE {} virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE { + BT_PROFILE( "parallelFor_sequential" ); body.forLoop( iBegin, iEnd ); } }; From ba88d332fb9c28689d615f11d934aa6125334082 Mon Sep 17 00:00:00 2001 From: Lunkhound Date: Mon, 29 May 2017 23:54:15 -0700 Subject: [PATCH 4/7] fix crash with out of range thread index --- examples/Utils/ChromeTraceUtil.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/Utils/ChromeTraceUtil.cpp b/examples/Utils/ChromeTraceUtil.cpp index 80139719f..02a892ba5 100644 --- a/examples/Utils/ChromeTraceUtil.cpp +++ b/examples/Utils/ChromeTraceUtil.cpp @@ -184,7 +184,7 @@ void MyEnterProfileZoneFunc(const char* msg) return; #ifndef BT_NO_PROFILE int threadId = btQuickprofGetCurrentThreadIndex2(); - if (threadId<0) + if (threadId<0 || threadId >= BT_QUICKPROF_MAX_THREAD_COUNT) return; if (gStackDepths[threadId] >= MAX_NESTING) @@ -208,8 +208,8 @@ void MyLeaveProfileZoneFunc() return; #ifndef BT_NO_PROFILE int threadId = btQuickprofGetCurrentThreadIndex2(); - if (threadId<0) - return; + if (threadId<0 || threadId >= BT_QUICKPROF_MAX_THREAD_COUNT) + return; if (gStackDepths[threadId] <= 0) { From 562858251ac1f180f28b2717728bfd5c10e289e5 Mon Sep 17 00:00:00 2001 From: Lunkhound Date: Tue, 30 May 2017 00:09:05 -0700 Subject: [PATCH 5/7] remove bad thread affinity mask setting --- examples/MultiThreading/b3Win32ThreadSupport.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/MultiThreading/b3Win32ThreadSupport.cpp b/examples/MultiThreading/b3Win32ThreadSupport.cpp index 539c0f3eb..c0987beab 100644 --- a/examples/MultiThreading/b3Win32ThreadSupport.cpp +++ b/examples/MultiThreading/b3Win32ThreadSupport.cpp @@ -279,9 +279,9 @@ void b3Win32ThreadSupport::startThreads(const Win32ThreadConstructionInfo& threa } - DWORD mask = 1; - mask = 1< Date: Tue, 30 May 2017 00:13:17 -0700 Subject: [PATCH 6/7] add task scheduler implemented with thread support interface --- examples/ExampleBrowser/CMakeLists.txt | 1 + .../CommonRigidBodyMTBase.cpp | 23 +- .../MultiThreadedDemo/MultiThreadedDemo.cpp | 5 +- examples/MultiThreading/btTaskScheduler.cpp | 448 ++++++++++++++++++ examples/MultiThreading/btTaskScheduler.h | 26 + 5 files changed, 498 insertions(+), 5 deletions(-) create mode 100644 examples/MultiThreading/btTaskScheduler.cpp create mode 100644 examples/MultiThreading/btTaskScheduler.h diff --git a/examples/ExampleBrowser/CMakeLists.txt b/examples/ExampleBrowser/CMakeLists.txt index 4a4db93db..9382f41f6 100644 --- a/examples/ExampleBrowser/CMakeLists.txt +++ b/examples/ExampleBrowser/CMakeLists.txt @@ -220,6 +220,7 @@ SET(BulletExampleBrowser_SRCS ../MultiThreading/b3PosixThreadSupport.cpp ../MultiThreading/b3Win32ThreadSupport.cpp ../MultiThreading/b3ThreadSupportInterface.cpp + ../MultiThreading/btTaskScheduler.cpp ../RenderingExamples/TinyRendererSetup.cpp ../RenderingExamples/TimeSeriesCanvas.cpp ../RenderingExamples/TimeSeriesCanvas.h diff --git a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp index a47b76da6..83eecca61 100644 --- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp +++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp @@ -35,6 +35,7 @@ class btCollisionShape; #include "BulletDynamics/MLCPSolvers/btSolveProjectedGaussSeidel.h" #include "BulletDynamics/MLCPSolvers/btDantzigSolver.h" #include "BulletDynamics/MLCPSolvers/btLemkeSolver.h" +#include "../MultiThreading/btTaskScheduler.h" static int gNumIslands = 0; @@ -173,7 +174,7 @@ void myParallelIslandDispatch( btAlignedObjectArray m_taskSchedulers; + btAlignedObjectArray m_allocatedTaskSchedulers; public: btTaskSchedulerManager() {} @@ -251,6 +253,11 @@ public: { addTaskScheduler( btGetSequentialTaskScheduler() ); #if BT_THREADSAFE + if ( btITaskScheduler* ts = createDefaultTaskScheduler() ) + { + m_allocatedTaskSchedulers.push_back( ts ); + addTaskScheduler( ts ); + } addTaskScheduler( btGetOpenMPTaskScheduler() ); addTaskScheduler( btGetTBBTaskScheduler() ); addTaskScheduler( btGetPPLTaskScheduler() ); @@ -263,9 +270,16 @@ public: { btSetTaskScheduler( m_taskSchedulers[ 0 ] ); } - btGetTaskScheduler()->setNumThreads( btGetTaskScheduler()->getMaxNumThreads() ); #endif // #if BT_THREADSAFE } + void shutdown() + { + for ( int i = 0; i < m_allocatedTaskSchedulers.size(); ++i ) + { + delete m_allocatedTaskSchedulers[ i ]; + } + m_allocatedTaskSchedulers.clear(); + } void addTaskScheduler( btITaskScheduler* ts ) { @@ -281,8 +295,13 @@ public: static btTaskSchedulerManager gTaskSchedulerMgr; +#if BT_THREADSAFE +static bool gMultithreadedWorld = true; +static bool gDisplayProfileInfo = true; +#else static bool gMultithreadedWorld = false; static bool gDisplayProfileInfo = false; +#endif static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE; static int gSolverMode = SOLVER_SIMD | SOLVER_USE_WARMSTARTING | diff --git a/examples/MultiThreadedDemo/MultiThreadedDemo.cpp b/examples/MultiThreadedDemo/MultiThreadedDemo.cpp index 3dff09ae8..96c856f53 100644 --- a/examples/MultiThreadedDemo/MultiThreadedDemo.cpp +++ b/examples/MultiThreadedDemo/MultiThreadedDemo.cpp @@ -28,11 +28,10 @@ class btCollisionShape; #include "btBulletCollisionCommon.h" -#define BT_OVERRIDE static btScalar gSliderStackRows = 8.0f; static btScalar gSliderStackColumns = 6.0f; -static btScalar gSliderStackHeight = 15.0f; +static btScalar gSliderStackHeight = 10.0f; static btScalar gSliderGroundHorizontalAmplitude = 0.0f; static btScalar gSliderGroundVerticalAmplitude = 0.0f; @@ -240,7 +239,7 @@ void MultiThreadedDemo::createSceneObjects() const btVector3 halfExtents = btVector3( 0.5f, 0.25f, 0.5f ); int numStackRows = btMax(1, int(gSliderStackRows)); int numStackCols = btMax(1, int(gSliderStackColumns)); - int stackHeight = 15; + int stackHeight = int(gSliderStackHeight); float stackZSpacing = 3.0f; float stackXSpacing = 20.0f; diff --git a/examples/MultiThreading/btTaskScheduler.cpp b/examples/MultiThreading/btTaskScheduler.cpp new file mode 100644 index 000000000..c4de30ebc --- /dev/null +++ b/examples/MultiThreading/btTaskScheduler.cpp @@ -0,0 +1,448 @@ + +#include "LinearMath/btTransform.h" +#include "../Utils/b3Clock.h" +#include "LinearMath/btAlignedObjectArray.h" +#include "LinearMath/btThreads.h" +#include "LinearMath/btQuickprof.h" +#include +#include + + +typedef void( *btThreadFunc )( void* userPtr, void* lsMemory ); +typedef void* ( *btThreadLocalStorageFunc )(); + +#if BT_THREADSAFE + +#if defined( _WIN32 ) + +#include "b3Win32ThreadSupport.h" + +b3ThreadSupportInterface* createThreadSupport( int numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName ) +{ + b3Win32ThreadSupport::Win32ThreadConstructionInfo constructionInfo( uniqueName, threadFunc, localStoreFunc, numThreads ); + //constructionInfo.m_priority = 0; // highest priority (the default) -- can cause erratic performance when numThreads > numCores + // we don't want worker threads to be higher priority than the main thread or the main thread could get + // totally shut out and unable to tell the workers to stop + constructionInfo.m_priority = -1; // normal priority + b3Win32ThreadSupport* threadSupport = new b3Win32ThreadSupport( constructionInfo ); + return threadSupport; +} + +#else // #if defined( _WIN32 ) + +#include "b3PosixThreadSupport.h" + +b3ThreadSupportInterface* createThreadSupport( int numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName) +{ + b3PosixThreadSupport::ThreadConstructionInfo constructionInfo( uniqueName, threadFunc, localStoreFunc, numThreads ); + b3ThreadSupportInterface* threadSupport = new b3PosixThreadSupport( constructionInfo ); + return threadSupport; +} + +#endif // #else // #if defined( _WIN32 ) + + +/// +/// getNumHardwareThreads() +/// +/// +/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine +/// +#if __cplusplus >= 201103L + +#include + +int getNumHardwareThreads() +{ + return std::thread::hardware_concurrency(); +} + +#elif defined( _WIN32 ) + +#define WIN32_LEAN_AND_MEAN + +#include + +int getNumHardwareThreads() +{ + // caps out at 32 + SYSTEM_INFO info; + GetSystemInfo( &info ); + return info.dwNumberOfProcessors; +} + +#else + +int getNumHardwareThreads() +{ + return 0; // don't know +} + +#endif + + +struct WorkerThreadStatus +{ + enum Type + { + kInvalid, + kWaitingForWork, + kWorking, + kSleeping, + }; +}; + + +struct IJob +{ + virtual void executeJob() = 0; +}; + +class ParallelForJob : public IJob +{ + const btIParallelForBody* mBody; + int mBegin; + int mEnd; + +public: + ParallelForJob() + { + mBody = NULL; + mBegin = 0; + mEnd = 0; + } + void init( int iBegin, int iEnd, const btIParallelForBody& body ) + { + mBody = &body; + mBegin = iBegin; + mEnd = iEnd; + } + virtual void executeJob() override + { + BT_PROFILE( "executeJob" ); + + // call the functor body to do the work + mBody->forLoop( mBegin, mEnd ); + } +}; + + +struct JobContext +{ + JobContext() + { + m_queueLock = NULL; + m_headIndex = 0; + m_tailIndex = 0; + m_workersShouldCheckQueue = false; + m_useSpinMutex = false; + } + b3CriticalSection* m_queueLock; + btSpinMutex m_mutex; + volatile bool m_workersShouldCheckQueue; + + btAlignedObjectArray m_jobQueue; + bool m_queueIsEmpty; + int m_tailIndex; + int m_headIndex; + bool m_useSpinMutex; + + void lockQueue() + { + if ( m_useSpinMutex ) + { + m_mutex.lock(); + } + else + { + m_queueLock->lock(); + } + } + void unlockQueue() + { + if ( m_useSpinMutex ) + { + m_mutex.unlock(); + } + else + { + m_queueLock->unlock(); + } + } + void clearQueue() + { + lockQueue(); + m_headIndex = 0; + m_tailIndex = 0; + m_queueIsEmpty = true; + unlockQueue(); + m_jobQueue.resizeNoInitialize( 0 ); + } + void submitJob( IJob* job ) + { + m_jobQueue.push_back( job ); + lockQueue(); + m_tailIndex++; + m_queueIsEmpty = false; + unlockQueue(); + } + IJob* consumeJob() + { + if ( m_queueIsEmpty ) + { + // lock free path. even if this is taken erroneously it isn't harmful + return NULL; + } + IJob* job = NULL; + lockQueue(); + if ( !m_queueIsEmpty ) + { + job = m_jobQueue[ m_headIndex++ ]; + if ( m_headIndex == m_tailIndex ) + { + m_queueIsEmpty = true; + } + } + unlockQueue(); + return job; + } +}; + + +struct WorkerThreadLocalStorage +{ + int threadId; + WorkerThreadStatus::Type status; +}; + + +static void WorkerThreadFunc( void* userPtr, void* lsMemory ) +{ + BT_PROFILE( "WorkerThreadFunc" ); + WorkerThreadLocalStorage* localStorage = (WorkerThreadLocalStorage*) lsMemory; + localStorage->status = WorkerThreadStatus::kWaitingForWork; + //printf( "WorkerThreadFunc: worker %d start working\n", localStorage->threadId ); + + JobContext* jobContext = (JobContext*) userPtr; + + while ( jobContext->m_workersShouldCheckQueue ) + { + if ( IJob* job = jobContext->consumeJob() ) + { + localStorage->status = WorkerThreadStatus::kWorking; + job->executeJob(); + localStorage->status = WorkerThreadStatus::kWaitingForWork; + } + else + { + // todo: spin wait a bit to avoid hammering the empty queue + } + } + + //printf( "WorkerThreadFunc stop working\n" ); + localStorage->status = WorkerThreadStatus::kSleeping; + // go idle +} + + +static void* WorkerThreadAllocFunc() +{ + return new WorkerThreadLocalStorage; +} + + + +class btTaskSchedulerDefault : public btITaskScheduler +{ + JobContext m_jobContext; + b3ThreadSupportInterface* m_threadSupport; + btAlignedObjectArray m_jobs; + btSpinMutex m_antiNestingLock; // prevent nested parallel-for + int m_numThreads; + int m_numWorkerThreads; + int m_numWorkersRunning; +public: + + btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport") + { + m_threadSupport = NULL; + m_numThreads = getNumHardwareThreads(); + // if can't detect number of cores, + if ( m_numThreads == 0 ) + { + // take a guess + m_numThreads = 4; + } + m_numWorkerThreads = m_numThreads - 1; + m_numWorkersRunning = 0; + } + + virtual ~btTaskSchedulerDefault() + { + shutdown(); + } + + void init() + { + int maxNumWorkerThreads = BT_MAX_THREAD_COUNT - 1; + m_threadSupport = createThreadSupport( maxNumWorkerThreads, WorkerThreadFunc, WorkerThreadAllocFunc, "TaskScheduler" ); + m_jobContext.m_queueLock = m_threadSupport->createCriticalSection(); + for ( int i = 0; i < maxNumWorkerThreads; i++ ) + { + WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i ); + btAssert( storage ); + storage->threadId = i; + storage->status = WorkerThreadStatus::kSleeping; + } + setWorkersActive( false ); // no work for them yet + } + + virtual void shutdown() + { + setWorkersActive( false ); + waitForWorkersToSleep(); + m_threadSupport->deleteCriticalSection( m_jobContext.m_queueLock ); + m_jobContext.m_queueLock = NULL; + + delete m_threadSupport; + m_threadSupport = NULL; + } + + void setWorkersActive( bool active ) + { + m_jobContext.m_workersShouldCheckQueue = active; + } + + virtual int getMaxNumThreads() const BT_OVERRIDE + { + return BT_MAX_THREAD_COUNT; + } + + virtual int getNumThreads() const BT_OVERRIDE + { + return m_numThreads; + } + + virtual void setNumThreads( int numThreads ) BT_OVERRIDE + { + m_numThreads = btMax( btMin(numThreads, int(BT_MAX_THREAD_COUNT)), 1 ); + m_numWorkerThreads = m_numThreads - 1; + } + + void waitJobs() + { + BT_PROFILE( "waitJobs" ); + // have the main thread work until the job queue is empty + for ( ;; ) + { + if ( IJob* job = m_jobContext.consumeJob() ) + { + job->executeJob(); + } + else + { + break; + } + } + // done with jobs for now, tell workers to rest + setWorkersActive( false ); + waitForWorkersToSleep(); + } + + void wakeWorkers() + { + BT_PROFILE( "wakeWorkers" ); + btAssert( m_jobContext.m_workersShouldCheckQueue ); + // tell each worker thread to start working + for ( int i = 0; i < m_numWorkerThreads; i++ ) + { + m_threadSupport->runTask( B3_THREAD_SCHEDULE_TASK, &m_jobContext, i ); + m_numWorkersRunning++; + } + } + + void waitForWorkersToSleep() + { + BT_PROFILE( "waitForWorkersToSleep" ); + while ( m_numWorkersRunning > 0 ) + { + int iThread; + int threadStatus; + m_threadSupport->waitForResponse( &iThread, &threadStatus ); // wait for worker threads to finish working + m_numWorkersRunning--; + } + //m_threadSupport->waitForAllTasksToComplete(); + for ( int i = 0; i < m_numWorkerThreads; i++ ) + { + //m_threadSupport->waitForTaskCompleted( i ); + WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i ); + btAssert( storage ); + btAssert( storage->status == WorkerThreadStatus::kSleeping ); + } + } + + virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE + { + BT_PROFILE( "parallelFor_ThreadSupport" ); + btAssert( iEnd >= iBegin ); + btAssert( grainSize >= 1 ); + int iterationCount = iEnd - iBegin; + if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() ) + { + int jobCount = ( iterationCount + grainSize - 1 ) / grainSize; + btAssert( jobCount >= 2 ); // need more than one job for multithreading + if ( jobCount > m_jobs.size() ) + { + m_jobs.resize( jobCount ); + } + if ( jobCount > m_jobContext.m_jobQueue.capacity() ) + { + m_jobContext.m_jobQueue.reserve( jobCount ); + } + + m_jobContext.clearQueue(); + // prepare worker threads for incoming work + setWorkersActive( true ); + wakeWorkers(); + // submit all of the jobs + int iJob = 0; + for ( int i = iBegin; i < iEnd; i += grainSize ) + { + btAssert( iJob < jobCount ); + int iE = btMin( i + grainSize, iEnd ); + ParallelForJob& job = m_jobs[ iJob ]; + job.init( i, iE, body ); + m_jobContext.submitJob( &job ); + iJob++; + } + + // put the main thread to work on emptying the job queue and then wait for all workers to finish + waitJobs(); + m_antiNestingLock.unlock(); + } + else + { + BT_PROFILE( "parallelFor_mainThread" ); + // just run on main thread + body.forLoop( iBegin, iEnd ); + } + } +}; + + + +btITaskScheduler* createDefaultTaskScheduler() +{ + btTaskSchedulerDefault* ts = new btTaskSchedulerDefault(); + ts->init(); + return ts; +} + +#else // #if BT_THREADSAFE + +btITaskScheduler* createDefaultTaskScheduler() +{ + return NULL; +} + +#endif // #else // #if BT_THREADSAFE \ No newline at end of file diff --git a/examples/MultiThreading/btTaskScheduler.h b/examples/MultiThreading/btTaskScheduler.h new file mode 100644 index 000000000..a83b635eb --- /dev/null +++ b/examples/MultiThreading/btTaskScheduler.h @@ -0,0 +1,26 @@ +/* +Copyright (c) 2003-2014 Erwin Coumans http://bullet.googlecode.com + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + + +#ifndef BT_TASK_SCHEDULER_H +#define BT_TASK_SCHEDULER_H + + +class btITaskScheduler; + +btITaskScheduler* createDefaultTaskScheduler(); + + +#endif // BT_TASK_SCHEDULER_H From cc1e6dc26902a4a9965163eb3a51b5fd3dc7b2c8 Mon Sep 17 00:00:00 2001 From: Lunkhound Date: Tue, 30 May 2017 00:47:41 -0700 Subject: [PATCH 7/7] make MultiThreadedDemo available if threadsafe --- examples/ExampleBrowser/ExampleEntries.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ExampleBrowser/ExampleEntries.cpp b/examples/ExampleBrowser/ExampleEntries.cpp index 04b2112c3..c4cb5bbf5 100644 --- a/examples/ExampleBrowser/ExampleEntries.cpp +++ b/examples/ExampleBrowser/ExampleEntries.cpp @@ -295,7 +295,7 @@ static ExampleEntry gDefaultExamples[]= ExampleEntry(1,"Fracture demo", "Create a basic custom implementation to model fracturing objects, based on a btCompoundShape. It explicitly propagates the collision impulses and breaks the rigid body into multiple rigid bodies. Press F to toggle fracture and glue mode.", FractureDemoCreateFunc), ExampleEntry(1,"Planar 2D","Show the use of 2D collision shapes and rigid body simulation. The collision shape is wrapped into a btConvex2dShape. The rigid bodies are restricted in a plane using the 'setAngularFactor' and 'setLinearFactor' API call.",Planar2DCreateFunc), -#if BT_USE_OPENMP || BT_USE_TBB || BT_USE_PPL +#if BT_THREADSAFE // only enable MultiThreaded demo if a task scheduler is available ExampleEntry( 1, "Multithreaded Demo", "Stacks of boxes that do not sleep. Good for testing performance with large numbers of bodies and contacts. Sliders can be used to change the number of stacks (restart needed after each change)."