Bullet 2 threading refactor: moved parallel-for calls into core libs

Lunkhound
2017-05-22 00:47:11 -07:00
parent 2f3844e5db
commit dfe184e8d3
14 changed files with 1012 additions and 847 deletions

View File

@@ -32,7 +32,6 @@ subject to the following restrictions:
#include "LinearMath/btAlignedObjectArray.h"
#include "LinearMath/btTransform.h"
#include "../MultiThreadedDemo/ParallelFor.h"
class btDynamicsWorld;
@@ -230,7 +229,7 @@ public:
}
}
struct CastRaysLoopBody
struct CastRaysLoopBody : public btIParallelForBody
{
btCollisionWorld* mWorld;
btRaycastBar2* mRaycasts;
@@ -274,7 +273,7 @@ public:
{
CastRaysLoopBody rayLooper(cw, this);
int grainSize = 20; // number of raycasts per task
parallelFor( 0, NUMRAYS, grainSize, rayLooper );
btParallelFor( 0, NUMRAYS, grainSize, rayLooper );
}
else
#endif // USE_PARALLEL_RAYCASTS
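
For context, a minimal sketch of the relocated API used in the hunk above: the loop body now derives from btIParallelForBody and is submitted through btParallelFor, both provided by the core library. The body type, its data, and the "LinearMath/btThreads.h" header name are illustrative assumptions, not part of this commit.

// Illustrative only: a stand-in for CastRaysLoopBody showing the btIParallelForBody /
// btParallelFor pattern this commit switches to. The header name is an assumption.
#include "LinearMath/btThreads.h"
#include "LinearMath/btScalar.h"

struct SquareLoopBody : public btIParallelForBody
{
    btScalar* mValues;  // data the loop operates on (hypothetical)
    explicit SquareLoopBody( btScalar* values ) : mValues( values ) {}
    // btParallelFor hands each worker task a sub-range [iBegin, iEnd)
    virtual void forLoop( int iBegin, int iEnd ) const
    {
        for ( int i = iBegin; i < iEnd; ++i )
        {
            mValues[ i ] *= mValues[ i ];
        }
    }
};

void squareAll( btScalar* values, int count )
{
    SquareLoopBody body( values );
    int grainSize = 64;  // iterations per task, like grainSize in the raycast hunk
    btParallelFor( 0, count, grainSize, body );  // replaces the demo-local parallelFor()
}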

View File

@@ -110,29 +110,6 @@ ELSE(WIN32)
ENDIF(APPLE)
ENDIF(WIN32)
IF (BULLET2_MULTITHREADED_OPEN_MP_DEMO)
ADD_DEFINITIONS("-DBT_USE_OPENMP=1")
IF (MSVC)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
ELSE (MSVC)
# GCC, Clang
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
ENDIF (MSVC)
ENDIF (BULLET2_MULTITHREADED_OPEN_MP_DEMO)
IF (BULLET2_MULTITHREADED_PPL_DEMO)
ADD_DEFINITIONS("-DBT_USE_PPL=1")
ENDIF (BULLET2_MULTITHREADED_PPL_DEMO)
IF (BULLET2_MULTITHREADED_TBB_DEMO)
SET (BULLET2_TBB_INCLUDE_DIR "not found" CACHE PATH "Directory for Intel TBB includes.")
SET (BULLET2_TBB_LIB_DIR "not found" CACHE PATH "Directory for Intel TBB libraries.")
find_library(TBB_LIBRARY tbb PATHS ${BULLET2_TBB_LIB_DIR})
find_library(TBBMALLOC_LIBRARY tbbmalloc PATHS ${BULLET2_TBB_LIB_DIR})
ADD_DEFINITIONS("-DBT_USE_TBB=1")
INCLUDE_DIRECTORIES( ${BULLET2_TBB_INCLUDE_DIR} )
LINK_LIBRARIES( ${TBB_LIBRARY} ${TBBMALLOC_LIBRARY} )
ENDIF (BULLET2_MULTITHREADED_TBB_DEMO)
SET(ExtendedTutorialsSources
../ExtendedTutorials/Chain.cpp
@@ -207,7 +184,6 @@ SET(BulletExampleBrowser_SRCS
../MultiThreadedDemo/MultiThreadedDemo.h
../MultiThreadedDemo/CommonRigidBodyMTBase.cpp
../MultiThreadedDemo/CommonRigidBodyMTBase.h
../MultiThreadedDemo/ParallelFor.h
../Tutorial/Tutorial.cpp
../Tutorial/Tutorial.h
../Tutorial/Dof6ConstraintTutorial.cpp
@@ -386,7 +362,7 @@ ADD_CUSTOM_COMMAND(
COMMAND ${CMAKE_COMMAND} ARGS -E copy_directory ${BULLET_PHYSICS_SOURCE_DIR}/data ${PROJECT_BINARY_DIR}/data
)
IF (BULLET2_MULTITHREADED_TBB_DEMO AND WIN32)
IF (BULLET2_USE_TBB_MULTITHREADING AND WIN32)
# add a post build command to copy some dlls to the executable directory
set(TBB_VC_VER "vc12")
set(TBB_VC_ARCH "ia32")
@@ -400,7 +376,7 @@ IF (BULLET2_MULTITHREADED_TBB_DEMO AND WIN32)
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${BULLET2_TBB_INCLUDE_DIR}/../bin/${TBB_VC_ARCH}/${TBB_VC_VER}/tbbmalloc.dll"
$<TARGET_FILE_DIR:App_ExampleBrowser>)
ENDIF (BULLET2_MULTITHREADED_TBB_DEMO AND WIN32)
ENDIF (BULLET2_USE_TBB_MULTITHREADING AND WIN32)
IF (INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES)
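
The per-demo OpenMP/TBB/PPL configuration removed above now follows the library-level BT_USE_OPENMP / BT_USE_TBB / BT_USE_PPL defines. The sketch below shows the selection pattern implied by the null check in addTaskScheduler() later in this commit; the assumption that a getter returns NULL when its backend was not compiled in is inferred from that check, not stated in the diff.

// Sketch (assumption): each btGet*TaskScheduler() getter returns NULL when the
// corresponding backend was not compiled into the library, so callers probe and
// fall back to the always-available sequential scheduler.
#include "LinearMath/btThreads.h"  // assumed header for the task-scheduler API

btITaskScheduler* pickTaskScheduler()
{
    if ( btITaskScheduler* ts = btGetOpenMPTaskScheduler() ) return ts;
    if ( btITaskScheduler* ts = btGetTBBTaskScheduler() ) return ts;
    if ( btITaskScheduler* ts = btGetPPLTaskScheduler() ) return ts;
    return btGetSequentialTaskScheduler();  // always present
}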

View File

@@ -23,10 +23,10 @@ class btCollisionShape;
#include "CommonRigidBodyMTBase.h"
#include "../CommonInterfaces/CommonParameterInterface.h"
#include "ParallelFor.h"
#include "LinearMath/btAlignedObjectArray.h"
#include "LinearMath/btPoolAllocator.h"
#include "btBulletCollisionCommon.h"
#include "BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h"
#include "BulletDynamics/Dynamics/btSimulationIslandManagerMt.h" // for setSplitIslands()
#include "BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h"
#include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"
@@ -36,20 +36,6 @@ class btCollisionShape;
#include "BulletDynamics/MLCPSolvers/btDantzigSolver.h"
#include "BulletDynamics/MLCPSolvers/btLemkeSolver.h"
TaskManager gTaskMgr;
#define USE_PARALLEL_NARROWPHASE 1 // detect collisions in parallel
#define USE_PARALLEL_ISLAND_SOLVER 1 // solve simulation islands in parallel
#define USE_PARALLEL_CREATE_PREDICTIVE_CONTACTS 1
#define USE_PARALLEL_INTEGRATE_TRANSFORMS 1
#define USE_PARALLEL_PREDICT_UNCONSTRAINED_MOTION 1
#if defined (_MSC_VER) && _MSC_VER >= 1600
// give us a compile error if any signatures of overridden methods are changed
#define BT_OVERRIDE override
#else
#define BT_OVERRIDE
#endif
static int gNumIslands = 0;
@@ -124,7 +110,7 @@ public:
};
Profiler gProfiler;
static Profiler gProfiler;
class ProfileHelper
{
@@ -141,457 +127,84 @@ public:
}
};
int gThreadsRunningCounter = 0;
btSpinMutex gThreadsRunningCounterMutex;
void btPushThreadsAreRunning()
static void profileBeginCallback( btDynamicsWorld *world, btScalar timeStep )
{
gThreadsRunningCounterMutex.lock();
gThreadsRunningCounter++;
gThreadsRunningCounterMutex.unlock();
gProfiler.begin( Profiler::kRecordInternalTimeStep );
}
void btPopThreadsAreRunning()
static void profileEndCallback( btDynamicsWorld *world, btScalar timeStep )
{
gThreadsRunningCounterMutex.lock();
gThreadsRunningCounter--;
gThreadsRunningCounterMutex.unlock();
}
bool btThreadsAreRunning()
{
return gThreadsRunningCounter != 0;
gProfiler.end( Profiler::kRecordInternalTimeStep );
}
#if USE_PARALLEL_NARROWPHASE
class MyCollisionDispatcher : public btCollisionDispatcher
///
/// MyCollisionDispatcher -- subclassed for profiling purposes
///
class MyCollisionDispatcher : public btCollisionDispatcherMt
{
btSpinMutex m_manifoldPtrsMutex;
typedef btCollisionDispatcherMt ParentClass;
public:
MyCollisionDispatcher( btCollisionConfiguration* config ) : btCollisionDispatcher( config )
MyCollisionDispatcher( btCollisionConfiguration* config, int grainSize ) : btCollisionDispatcherMt( config, grainSize )
{
}
virtual ~MyCollisionDispatcher()
{
}
btPersistentManifold* getNewManifold( const btCollisionObject* body0, const btCollisionObject* body1 ) BT_OVERRIDE
{
// added spin-locks
//optional relative contact breaking threshold, turned on by default (use setDispatcherFlags to switch off feature for improved performance)
btScalar contactBreakingThreshold = ( m_dispatcherFlags & btCollisionDispatcher::CD_USE_RELATIVE_CONTACT_BREAKING_THRESHOLD ) ?
btMin( body0->getCollisionShape()->getContactBreakingThreshold( gContactBreakingThreshold ), body1->getCollisionShape()->getContactBreakingThreshold( gContactBreakingThreshold ) )
: gContactBreakingThreshold;
btScalar contactProcessingThreshold = btMin( body0->getContactProcessingThreshold(), body1->getContactProcessingThreshold() );
void* mem = m_persistentManifoldPoolAllocator->allocate( sizeof( btPersistentManifold ) );
if (NULL == mem)
{
//we got a pool memory overflow, by default we fallback to dynamically allocate memory. If we require a contiguous contact pool then assert.
if ( ( m_dispatcherFlags&CD_DISABLE_CONTACTPOOL_DYNAMIC_ALLOCATION ) == 0 )
{
mem = btAlignedAlloc( sizeof( btPersistentManifold ), 16 );
}
else
{
btAssert( 0 );
//make sure to increase the m_defaultMaxPersistentManifoldPoolSize in the btDefaultCollisionConstructionInfo/btDefaultCollisionConfiguration
return 0;
}
}
btPersistentManifold* manifold = new(mem) btPersistentManifold( body0, body1, 0, contactBreakingThreshold, contactProcessingThreshold );
m_manifoldPtrsMutex.lock();
manifold->m_index1a = m_manifoldsPtr.size();
m_manifoldsPtr.push_back( manifold );
m_manifoldPtrsMutex.unlock();
return manifold;
}
void releaseManifold( btPersistentManifold* manifold ) BT_OVERRIDE
{
clearManifold( manifold );
m_manifoldPtrsMutex.lock();
int findIndex = manifold->m_index1a;
btAssert( findIndex < m_manifoldsPtr.size() );
m_manifoldsPtr.swap( findIndex, m_manifoldsPtr.size() - 1 );
m_manifoldsPtr[ findIndex ]->m_index1a = findIndex;
m_manifoldsPtr.pop_back();
m_manifoldPtrsMutex.unlock();
manifold->~btPersistentManifold();
if ( m_persistentManifoldPoolAllocator->validPtr( manifold ) )
{
m_persistentManifoldPoolAllocator->freeMemory( manifold );
}
else
{
btAlignedFree( manifold );
}
}
struct Updater
{
btBroadphasePair* mPairArray;
btNearCallback mCallback;
btCollisionDispatcher* mDispatcher;
const btDispatcherInfo* mInfo;
Updater()
{
mPairArray = NULL;
mCallback = NULL;
mDispatcher = NULL;
mInfo = NULL;
}
void forLoop( int iBegin, int iEnd ) const
{
for ( int i = iBegin; i < iEnd; ++i )
{
btBroadphasePair* pair = &mPairArray[ i ];
mCallback( *pair, *mDispatcher, *mInfo );
}
}
};
virtual void dispatchAllCollisionPairs( btOverlappingPairCache* pairCache, const btDispatcherInfo& info, btDispatcher* dispatcher ) BT_OVERRIDE
{
ProfileHelper prof(Profiler::kRecordDispatchAllCollisionPairs);
int grainSize = 40; // iterations per task
int pairCount = pairCache->getNumOverlappingPairs();
Updater updater;
updater.mCallback = getNearCallback();
updater.mPairArray = pairCount > 0 ? pairCache->getOverlappingPairArrayPtr() : NULL;
updater.mDispatcher = this;
updater.mInfo = &info;
btPushThreadsAreRunning();
parallelFor( 0, pairCount, grainSize, updater );
btPopThreadsAreRunning();
if (m_manifoldsPtr.size() < 1)
return;
// reconstruct the manifolds array to ensure determinism
m_manifoldsPtr.resizeNoInitialize(0);
btBroadphasePair* pairs = pairCache->getOverlappingPairArrayPtr();
for (int i = 0; i < pairCount; ++i)
{
btCollisionAlgorithm* algo = pairs[i].m_algorithm;
if (algo) algo->getAllContactManifolds(m_manifoldsPtr);
}
// update the indices (used when releasing manifolds)
for (int i = 0; i < m_manifoldsPtr.size(); ++i)
m_manifoldsPtr[i]->m_index1a = i;
ProfileHelper prof( Profiler::kRecordDispatchAllCollisionPairs );
ParentClass::dispatchAllCollisionPairs( pairCache, info, dispatcher );
}
};
#endif
#if USE_PARALLEL_ISLAND_SOLVER
///
/// MyConstraintSolverPool - masquerades as a constraint solver, but really it is a threadsafe pool of them.
///
/// Each solver in the pool is protected by a mutex. When solveGroup is called from a thread,
/// the pool looks for a solver that isn't being used by another thread, locks it, and dispatches the
/// call to the solver.
/// So long as there are at least as many solvers as there are hardware threads, it should never need to
/// spin wait.
///
class MyConstraintSolverPool : public btConstraintSolver
/// myParallelIslandDispatch -- wrap default parallel dispatch for profiling and to get the number of simulation islands
//
void myParallelIslandDispatch( btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr, btSimulationIslandManagerMt::IslandCallback* callback )
{
const static size_t kCacheLineSize = 128;
struct ThreadSolver
{
btConstraintSolver* solver;
btSpinMutex mutex;
char _cachelinePadding[ kCacheLineSize - sizeof( btSpinMutex ) - sizeof( void* ) ]; // keep mutexes from sharing a cache line
};
btAlignedObjectArray<ThreadSolver> m_solvers;
btConstraintSolverType m_solverType;
ThreadSolver* getAndLockThreadSolver()
{
while ( true )
{
for ( int i = 0; i < m_solvers.size(); ++i )
{
ThreadSolver& solver = m_solvers[ i ];
if ( solver.mutex.tryLock() )
{
return &solver;
}
}
}
return NULL;
}
void init( btConstraintSolver** solvers, int numSolvers )
{
m_solverType = BT_SEQUENTIAL_IMPULSE_SOLVER;
m_solvers.resize( numSolvers );
for ( int i = 0; i < numSolvers; ++i )
{
m_solvers[ i ].solver = solvers[ i ];
}
if ( numSolvers > 0 )
{
m_solverType = solvers[ 0 ]->getSolverType();
}
}
public:
// create the solvers for me
explicit MyConstraintSolverPool( int numSolvers )
{
btAlignedObjectArray<btConstraintSolver*> solvers;
solvers.reserve( numSolvers );
for ( int i = 0; i < numSolvers; ++i )
{
btConstraintSolver* solver = new btSequentialImpulseConstraintSolver();
solvers.push_back( solver );
}
init( &solvers[ 0 ], numSolvers );
}
// pass in fully constructed solvers (destructor will delete them)
MyConstraintSolverPool( btConstraintSolver** solvers, int numSolvers )
{
init( solvers, numSolvers );
}
virtual ~MyConstraintSolverPool()
{
// delete all solvers
for ( int i = 0; i < m_solvers.size(); ++i )
{
ThreadSolver& solver = m_solvers[ i ];
delete solver.solver;
solver.solver = NULL;
}
}
//virtual void prepareSolve( int /* numBodies */, int /* numManifolds */ ) { ; } // does nothing
///solve a group of constraints
virtual btScalar solveGroup( btCollisionObject** bodies,
int numBodies,
btPersistentManifold** manifolds,
int numManifolds,
btTypedConstraint** constraints,
int numConstraints,
const btContactSolverInfo& info,
btIDebugDraw* debugDrawer,
btDispatcher* dispatcher
)
{
ThreadSolver* solver = getAndLockThreadSolver();
solver->solver->solveGroup( bodies, numBodies, manifolds, numManifolds, constraints, numConstraints, info, debugDrawer, dispatcher );
solver->mutex.unlock();
return 0.0f;
}
//virtual void allSolved( const btContactSolverInfo& /* info */, class btIDebugDraw* /* debugDrawer */ ) { ; } // does nothing
///clear internal cached data and reset random seed
virtual void reset()
{
for ( int i = 0; i < m_solvers.size(); ++i )
{
ThreadSolver& solver = m_solvers[ i ];
solver.mutex.lock();
solver.solver->reset();
solver.mutex.unlock();
}
}
virtual btConstraintSolverType getSolverType() const
{
return m_solverType;
}
};
struct UpdateIslandDispatcher
{
btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr;
btSimulationIslandManagerMt::IslandCallback* callback;
void forLoop( int iBegin, int iEnd ) const
{
for ( int i = iBegin; i < iEnd; ++i )
{
btSimulationIslandManagerMt::Island* island = ( *islandsPtr )[ i ];
btPersistentManifold** manifolds = island->manifoldArray.size() ? &island->manifoldArray[ 0 ] : NULL;
btTypedConstraint** constraintsPtr = island->constraintArray.size() ? &island->constraintArray[ 0 ] : NULL;
callback->processIsland( &island->bodyArray[ 0 ],
island->bodyArray.size(),
manifolds,
island->manifoldArray.size(),
constraintsPtr,
island->constraintArray.size(),
island->id
);
}
}
};
void parallelIslandDispatch( btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr, btSimulationIslandManagerMt::IslandCallback* callback )
{
ProfileHelper prof(Profiler::kRecordDispatchIslands);
ProfileHelper prof( Profiler::kRecordDispatchIslands );
gNumIslands = islandsPtr->size();
int grainSize = 1; // iterations per task
UpdateIslandDispatcher dispatcher;
dispatcher.islandsPtr = islandsPtr;
dispatcher.callback = callback;
btPushThreadsAreRunning();
parallelFor( 0, islandsPtr->size(), grainSize, dispatcher );
btPopThreadsAreRunning();
}
#endif //#if USE_PARALLEL_ISLAND_SOLVER
void profileBeginCallback(btDynamicsWorld *world, btScalar timeStep)
{
gProfiler.begin(Profiler::kRecordInternalTimeStep);
btSimulationIslandManagerMt::parallelIslandDispatch( islandsPtr, callback );
}
void profileEndCallback(btDynamicsWorld *world, btScalar timeStep)
{
gProfiler.end(Profiler::kRecordInternalTimeStep);
}
///
/// MyDiscreteDynamicsWorld
///
/// Should function exactly like btDiscreteDynamicsWorld.
/// 3 methods that iterate over all of the rigidbodies can run in parallel:
/// - predictUnconstraintMotion
/// - integrateTransforms
/// - createPredictiveContacts
/// MyDiscreteDynamicsWorld -- subclassed for profiling purposes
///
ATTRIBUTE_ALIGNED16( class ) MyDiscreteDynamicsWorld : public btDiscreteDynamicsWorldMt
{
typedef btDiscreteDynamicsWorld ParentClass;
protected:
#if USE_PARALLEL_PREDICT_UNCONSTRAINED_MOTION
struct UpdaterUnconstrainedMotion
{
btScalar timeStep;
btRigidBody** rigidBodies;
void forLoop( int iBegin, int iEnd ) const
{
for ( int i = iBegin; i < iEnd; ++i )
{
btRigidBody* body = rigidBodies[ i ];
if ( !body->isStaticOrKinematicObject() )
{
//don't integrate/update velocities here, it happens in the constraint solver
body->applyDamping( timeStep );
body->predictIntegratedTransform( timeStep, body->getInterpolationWorldTransform() );
}
}
}
};
virtual void predictUnconstraintMotion( btScalar timeStep ) BT_OVERRIDE
{
ProfileHelper prof( Profiler::kRecordPredictUnconstrainedMotion );
BT_PROFILE( "predictUnconstraintMotion" );
int grainSize = 50; // num of iterations per task for TBB
int bodyCount = m_nonStaticRigidBodies.size();
UpdaterUnconstrainedMotion update;
update.timeStep = timeStep;
update.rigidBodies = bodyCount ? &m_nonStaticRigidBodies[ 0 ] : NULL;
btPushThreadsAreRunning();
parallelFor( 0, bodyCount, grainSize, update );
btPopThreadsAreRunning();
ParentClass::predictUnconstraintMotion( timeStep );
}
#endif // #if USE_PARALLEL_PREDICT_UNCONSTRAINED_MOTION
#if USE_PARALLEL_CREATE_PREDICTIVE_CONTACTS
struct UpdaterCreatePredictiveContacts
{
btScalar timeStep;
btRigidBody** rigidBodies;
MyDiscreteDynamicsWorld* world;
void forLoop( int iBegin, int iEnd ) const
{
world->createPredictiveContactsInternal( &rigidBodies[ iBegin ], iEnd - iBegin, timeStep );
}
};
virtual void createPredictiveContacts( btScalar timeStep )
virtual void createPredictiveContacts( btScalar timeStep ) BT_OVERRIDE
{
ProfileHelper prof( Profiler::kRecordCreatePredictiveContacts );
releasePredictiveContacts();
int grainSize = 50; // num of iterations per task for TBB or OPENMP
if ( int bodyCount = m_nonStaticRigidBodies.size() )
{
UpdaterCreatePredictiveContacts update;
update.world = this;
update.timeStep = timeStep;
update.rigidBodies = &m_nonStaticRigidBodies[ 0 ];
btPushThreadsAreRunning();
parallelFor( 0, bodyCount, grainSize, update );
btPopThreadsAreRunning();
}
ParentClass::createPredictiveContacts( timeStep );
}
#endif // #if USE_PARALLEL_CREATE_PREDICTIVE_CONTACTS
#if USE_PARALLEL_INTEGRATE_TRANSFORMS
struct UpdaterIntegrateTransforms
{
btScalar timeStep;
btRigidBody** rigidBodies;
MyDiscreteDynamicsWorld* world;
void forLoop( int iBegin, int iEnd ) const
{
world->integrateTransformsInternal( &rigidBodies[ iBegin ], iEnd - iBegin, timeStep );
}
};
virtual void integrateTransforms( btScalar timeStep ) BT_OVERRIDE
{
ProfileHelper prof( Profiler::kRecordIntegrateTransforms );
BT_PROFILE( "integrateTransforms" );
int grainSize = 50; // num of iterations per task for TBB or OPENMP
if ( int bodyCount = m_nonStaticRigidBodies.size() )
{
UpdaterIntegrateTransforms update;
update.world = this;
update.timeStep = timeStep;
update.rigidBodies = &m_nonStaticRigidBodies[ 0 ];
btPushThreadsAreRunning();
parallelFor( 0, bodyCount, grainSize, update );
btPopThreadsAreRunning();
}
ParentClass::integrateTransforms( timeStep );
}
#endif // #if USE_PARALLEL_INTEGRATE_TRANSFORMS
public:
BT_DECLARE_ALIGNED_ALLOCATOR();
MyDiscreteDynamicsWorld( btDispatcher* dispatcher,
btBroadphaseInterface* pairCache,
btConstraintSolver* constraintSolver,
btConstraintSolverPoolMt* constraintSolver,
btCollisionConfiguration* collisionConfiguration
) :
btDiscreteDynamicsWorldMt( dispatcher, pairCache, constraintSolver, collisionConfiguration )
{
#if USE_PARALLEL_ISLAND_SOLVER
btSimulationIslandManagerMt* islandMgr = static_cast<btSimulationIslandManagerMt*>( m_islandManager );
islandMgr->setIslandDispatchFunction( parallelIslandDispatch );
#endif //#if USE_PARALLEL_ISLAND_SOLVER
islandMgr->setIslandDispatchFunction( myParallelIslandDispatch );
}
};
@@ -625,6 +238,47 @@ btConstraintSolver* createSolverByType( SolverType t )
}
///
/// btTaskSchedulerManager -- manage a number of task schedulers so we can switch between them
///
class btTaskSchedulerManager
{
btAlignedObjectArray<btITaskScheduler*> m_taskSchedulers;
public:
btTaskSchedulerManager() {}
void init()
{
addTaskScheduler( btGetSequentialTaskScheduler() );
addTaskScheduler( btGetOpenMPTaskScheduler() );
addTaskScheduler( btGetTBBTaskScheduler() );
addTaskScheduler( btGetPPLTaskScheduler() );
if ( getNumTaskSchedulers() > 1 )
{
// prefer a non-sequential scheduler if available
btSetTaskScheduler( m_taskSchedulers[ 1 ] );
}
else
{
btSetTaskScheduler( m_taskSchedulers[ 0 ] );
}
btGetTaskScheduler()->setNumThreads( btGetTaskScheduler()->getMaxNumThreads() );
}
void addTaskScheduler( btITaskScheduler* ts )
{
if ( ts )
{
m_taskSchedulers.push_back( ts );
}
}
int getNumTaskSchedulers() const { return m_taskSchedulers.size(); }
btITaskScheduler* getTaskScheduler( int i ) { return m_taskSchedulers[ i ]; }
};
static btTaskSchedulerManager gTaskSchedulerMgr;
static bool gMultithreadedWorld = false;
static bool gDisplayProfileInfo = false;
static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
@@ -652,15 +306,17 @@ CommonRigidBodyMTBase::CommonRigidBodyMTBase( struct GUIHelperInterface* helper
{
m_multithreadedWorld = false;
m_multithreadCapable = false;
gTaskMgr.init();
if ( gTaskSchedulerMgr.getNumTaskSchedulers() == 0 )
{
gTaskSchedulerMgr.init();
}
}
CommonRigidBodyMTBase::~CommonRigidBodyMTBase()
{
gTaskMgr.shutdown();
}
void boolPtrButtonCallback(int buttonId, bool buttonState, void* userPointer)
static void boolPtrButtonCallback(int buttonId, bool buttonState, void* userPointer)
{
if (bool* val = static_cast<bool*>(userPointer))
{
@@ -668,7 +324,7 @@ void boolPtrButtonCallback(int buttonId, bool buttonState, void* userPointer)
}
}
void toggleSolverModeCallback(int buttonId, bool buttonState, void* userPointer)
static void toggleSolverModeCallback(int buttonId, bool buttonState, void* userPointer)
{
if (buttonState)
{
@@ -687,7 +343,7 @@ void toggleSolverModeCallback(int buttonId, bool buttonState, void* userPointer)
}
}
void setSolverTypeCallback(int buttonId, bool buttonState, void* userPointer)
static void setSolverTypeCallback(int buttonId, bool buttonState, void* userPointer)
{
if (buttonId >= 0 && buttonId < SOLVER_TYPE_COUNT)
{
@@ -695,32 +351,30 @@ void setSolverTypeCallback(int buttonId, bool buttonState, void* userPointer)
}
}
void apiSelectButtonCallback(int buttonId, bool buttonState, void* userPointer)
static void setNumThreads( int numThreads )
{
gTaskMgr.setApi(static_cast<TaskManager::Api>(buttonId));
if (gTaskMgr.getApi()==TaskManager::apiNone)
int newNumThreads = ( std::min )( numThreads, int( BT_MAX_THREAD_COUNT ) );
int oldNumThreads = btGetTaskScheduler()->getNumThreads();
// only call when the thread count is different
if ( newNumThreads != oldNumThreads )
{
gSliderNumThreads = 1.0f;
}
else
{
gSliderNumThreads = float(gTaskMgr.getNumThreads());
btGetTaskScheduler()->setNumThreads( newNumThreads );
}
}
void setThreadCountCallback(float val, void* userPtr)
static void apiSelectButtonCallback(int buttonId, bool buttonState, void* userPointer)
{
if (gTaskMgr.getApi()==TaskManager::apiNone)
{
gSliderNumThreads = 1.0f;
}
else
{
gTaskMgr.setNumThreads( int( gSliderNumThreads ) );
}
// change the task scheduler
btSetTaskScheduler( gTaskSchedulerMgr.getTaskScheduler( buttonId ) );
setNumThreads( int( gSliderNumThreads ) );
}
void setSolverIterationCountCallback(float val, void* userPtr)
static void setThreadCountCallback(float val, void* userPtr)
{
setNumThreads( int( gSliderNumThreads ) );
}
static void setSolverIterationCountCallback(float val, void* userPtr)
{
if (btDiscreteDynamicsWorld* world = reinterpret_cast<btDiscreteDynamicsWorld*>(userPtr))
{
@@ -733,6 +387,7 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
gNumIslands = 0;
m_solverType = gSolverType;
#if BT_THREADSAFE && (BT_USE_OPENMP || BT_USE_PPL || BT_USE_TBB)
btAssert( btGetTaskScheduler() != NULL );
m_multithreadCapable = true;
#endif
if ( gMultithreadedWorld )
@@ -743,30 +398,24 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
cci.m_defaultMaxCollisionAlgorithmPoolSize = 80000;
m_collisionConfiguration = new btDefaultCollisionConfiguration( cci );
#if USE_PARALLEL_NARROWPHASE
m_dispatcher = new MyCollisionDispatcher( m_collisionConfiguration );
#else
m_dispatcher = new btCollisionDispatcher( m_collisionConfiguration );
#endif //USE_PARALLEL_NARROWPHASE
m_dispatcher = new MyCollisionDispatcher( m_collisionConfiguration, 40 );
m_broadphase = new btDbvtBroadphase();
#if BT_THREADSAFE && USE_PARALLEL_ISLAND_SOLVER
btConstraintSolverPoolMt* solverPool;
{
btConstraintSolver* solvers[ BT_MAX_THREAD_COUNT ];
int maxThreadCount = btMin( int(BT_MAX_THREAD_COUNT), TaskManager::getMaxNumThreads() );
int maxThreadCount = BT_MAX_THREAD_COUNT;
for ( int i = 0; i < maxThreadCount; ++i )
{
solvers[ i ] = createSolverByType( m_solverType );
}
m_solver = new MyConstraintSolverPool( solvers, maxThreadCount );
solverPool = new btConstraintSolverPoolMt( solvers, maxThreadCount );
m_solver = solverPool;
}
#else
m_solver = createSolverByType( m_solverType );
#endif //#if USE_PARALLEL_ISLAND_SOLVER
btDiscreteDynamicsWorld* world = new MyDiscreteDynamicsWorld( m_dispatcher, m_broadphase, m_solver, m_collisionConfiguration );
btDiscreteDynamicsWorld* world = new MyDiscreteDynamicsWorld( m_dispatcher, m_broadphase, solverPool, m_collisionConfiguration );
m_dynamicsWorld = world;
m_multithreadedWorld = true;
btAssert( btGetTaskScheduler() != NULL );
}
else
{
@@ -886,24 +535,25 @@ void CommonRigidBodyMTBase::createDefaultParameters()
if (m_multithreadedWorld)
{
// create a button for each supported threading API
for (int iApi = 0; iApi < TaskManager::apiCount; ++iApi)
for ( int iApi = 0; iApi < gTaskSchedulerMgr.getNumTaskSchedulers(); ++iApi )
{
TaskManager::Api api = static_cast<TaskManager::Api>(iApi);
if (gTaskMgr.isSupported(api))
{
char str[1024];
sprintf(str, "API %s", gTaskMgr.getApiName(api));
ButtonParams button( str, iApi, false );
button.m_callback = apiSelectButtonCallback;
m_guiHelper->getParameterInterface()->registerButtonParameter( button );
}
char str[ 1024 ];
sprintf( str, "API %s", gTaskSchedulerMgr.getTaskScheduler(iApi)->getName() );
ButtonParams button( str, iApi, false );
button.m_callback = apiSelectButtonCallback;
m_guiHelper->getParameterInterface()->registerButtonParameter( button );
}
{
// create a slider to set the number of threads to use
gSliderNumThreads = float(gTaskMgr.getNumThreads());
int numThreads = btGetTaskScheduler()->getNumThreads();
// if slider has not been set yet (by another demo),
if ( gSliderNumThreads <= 1.0f )
{
gSliderNumThreads = float( numThreads );
}
SliderParams slider("Thread count", &gSliderNumThreads);
slider.m_minVal = 1.0f;
slider.m_maxVal = float(gTaskMgr.getMaxNumThreads()*2);
slider.m_maxVal = float( BT_MAX_THREAD_COUNT );
slider.m_callback = setThreadCountCallback;
slider.m_clampToIntegers = true;
m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
@@ -946,14 +596,14 @@ void CommonRigidBodyMTBase::drawScreenText()
const btPersistentManifold* man = m_dispatcher->getManifoldByIndexInternal( i );
numContacts += man->getNumContacts();
}
const char* mtApi = TaskManager::getApiName( gTaskMgr.getApi() );
const char* mtApi = btGetTaskScheduler()->getName();
sprintf( msg, "islands=%d bodies=%d manifolds=%d contacts=%d [%s] threads=%d",
gNumIslands,
m_dynamicsWorld->getNumCollisionObjects(),
numManifolds,
numContacts,
mtApi,
gTaskMgr.getApi() == TaskManager::apiNone ? 1 : gTaskMgr.getNumThreads()
btGetTaskScheduler()->getNumThreads()
);
m_guiHelper->getAppInterface()->drawText( msg, 100, yCoord, 0.4f );
yCoord += yStep;
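
Condensing the construction path shown in this file's diff, here is a sketch of how the multithreaded world is now assembled entirely from core-library classes. The helper function and its error-free flow are illustrative; the class names, constructor arguments, pool size, and the grain size of 40 are taken from the diff above.

// Illustrative consolidation of the world setup above; not a verbatim excerpt.
#include "btBulletDynamicsCommon.h"
#include "BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h"
#include "BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h"
#include "LinearMath/btThreads.h"  // assumed header for BT_MAX_THREAD_COUNT

btDiscreteDynamicsWorld* createMultithreadedWorld()
{
    btDefaultCollisionConstructionInfo cci;
    cci.m_defaultMaxCollisionAlgorithmPoolSize = 80000;  // value used in the diff
    btCollisionConfiguration* config = new btDefaultCollisionConfiguration( cci );

    // parallel narrowphase now lives in the core library; 40 = collision pairs per task
    btDispatcher* dispatcher = new btCollisionDispatcherMt( config, 40 );
    btBroadphaseInterface* broadphase = new btDbvtBroadphase();

    // one solver per possible worker thread, wrapped by the core solver pool
    btConstraintSolver* solvers[ BT_MAX_THREAD_COUNT ];
    for ( int i = 0; i < BT_MAX_THREAD_COUNT; ++i )
    {
        solvers[ i ] = new btSequentialImpulseConstraintSolver();
    }
    btConstraintSolverPoolMt* solverPool = new btConstraintSolverPoolMt( solvers, BT_MAX_THREAD_COUNT );

    return new btDiscreteDynamicsWorldMt( dispatcher, broadphase, solverPool, config );
}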

View File

@@ -1,336 +0,0 @@
/*
Bullet Continuous Collision Detection and Physics Library
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include <stdio.h> //printf debugging
#include <algorithm>
// choose threading providers:
#if BT_USE_TBB
#define USE_TBB 1 // use Intel Threading Building Blocks for thread management
#endif
#if BT_USE_PPL
#define USE_PPL 1 // use Microsoft Parallel Patterns Library (installed with Visual Studio 2010 and later)
#endif // BT_USE_PPL
#if BT_USE_OPENMP
#define USE_OPENMP 1 // use OpenMP (also need to change compiler options for OpenMP support)
#endif
#if USE_OPENMP
#include <omp.h>
#endif // #if USE_OPENMP
#if USE_PPL
#include <ppl.h> // if you get a compile error here, check whether your version of Visual Studio includes PPL
// Visual Studio 2010 and later should come with it
#include <concrtrm.h> // for GetProcessorCount()
#endif // #if USE_PPL
#if USE_TBB
#define __TBB_NO_IMPLICIT_LINKAGE 1
#include <tbb/tbb.h>
#include <tbb/task_scheduler_init.h>
#include <tbb/parallel_for.h>
#include <tbb/blocked_range.h>
#endif // #if USE_TBB
class TaskManager
{
public:
enum Api
{
apiNone,
apiOpenMP,
apiTbb,
apiPpl,
apiCount
};
static const char* getApiName( Api api )
{
switch ( api )
{
case apiNone: return "None";
case apiOpenMP: return "OpenMP";
case apiTbb: return "Intel TBB";
case apiPpl: return "MS PPL";
default: return "unknown";
}
}
TaskManager()
{
m_api = apiNone;
m_numThreads = 0;
#if USE_TBB
m_tbbSchedulerInit = NULL;
#endif // #if USE_TBB
}
Api getApi() const
{
return m_api;
}
bool isSupported( Api api ) const
{
#if USE_OPENMP
if ( api == apiOpenMP )
{
return true;
}
#endif
#if USE_TBB
if ( api == apiTbb )
{
return true;
}
#endif
#if USE_PPL
if ( api == apiPpl )
{
return true;
}
#endif
// apiNone is always "supported"
return api == apiNone;
}
void setApi( Api api )
{
if (isSupported(api))
{
m_api = api;
}
else
{
// no compile time support for selected API, fallback to "none"
m_api = apiNone;
}
}
static int getMaxNumThreads()
{
#if USE_OPENMP
return omp_get_max_threads();
#elif USE_PPL
return concurrency::GetProcessorCount();
#elif USE_TBB
return tbb::task_scheduler_init::default_num_threads();
#endif
return 1;
}
int getNumThreads() const
{
return m_numThreads;
}
int setNumThreads( int numThreads )
{
m_numThreads = ( std::max )( 1, numThreads );
#if USE_OPENMP
omp_set_num_threads( m_numThreads );
#endif
#if USE_PPL
{
using namespace concurrency;
if ( CurrentScheduler::Id() != -1 )
{
CurrentScheduler::Detach();
}
SchedulerPolicy policy;
policy.SetConcurrencyLimits( m_numThreads, m_numThreads );
CurrentScheduler::Create( policy );
}
#endif
#if USE_TBB
if ( m_tbbSchedulerInit )
{
delete m_tbbSchedulerInit;
m_tbbSchedulerInit = NULL;
}
m_tbbSchedulerInit = new tbb::task_scheduler_init( m_numThreads );
#endif
return m_numThreads;
}
void init()
{
if (m_numThreads == 0)
{
#if USE_PPL
setApi( apiPpl );
#endif
#if USE_TBB
setApi( apiTbb );
#endif
#if USE_OPENMP
setApi( apiOpenMP );
#endif
setNumThreads(getMaxNumThreads());
}
else
{
setNumThreads(m_numThreads);
}
}
void shutdown()
{
#if USE_TBB
if ( m_tbbSchedulerInit )
{
delete m_tbbSchedulerInit;
m_tbbSchedulerInit = NULL;
}
#endif
}
private:
Api m_api;
int m_numThreads;
#if USE_TBB
tbb::task_scheduler_init* m_tbbSchedulerInit;
#endif // #if USE_TBB
};
extern TaskManager gTaskMgr;
inline static void initTaskScheduler()
{
gTaskMgr.init();
}
inline static void cleanupTaskScheduler()
{
gTaskMgr.shutdown();
}
#if USE_TBB
///
/// TbbBodyAdapter -- Converts a body object that implements the
/// "forLoop(int iBegin, int iEnd) const" function
/// into a TBB compatible object that takes a tbb::blocked_range<int> type.
///
template <class TBody>
struct TbbBodyAdapter
{
const TBody* mBody;
void operator()( const tbb::blocked_range<int>& range ) const
{
mBody->forLoop( range.begin(), range.end() );
}
};
#endif // #if USE_TBB
#if USE_PPL
///
/// PplBodyAdapter -- Converts a body object that implements the
/// "forLoop(int iBegin, int iEnd) const" function
/// into a PPL compatible object that implements "void operator()( int ) const"
///
template <class TBody>
struct PplBodyAdapter
{
const TBody* mBody;
int mGrainSize;
int mIndexEnd;
void operator()( int i ) const
{
mBody->forLoop( i, (std::min)(i + mGrainSize, mIndexEnd) );
}
};
#endif // #if USE_PPL
///
/// parallelFor -- interface for submitting work expressed as a for loop to the worker threads
///
template <class TBody>
void parallelFor( int iBegin, int iEnd, int grainSize, const TBody& body )
{
#if USE_OPENMP
if ( gTaskMgr.getApi() == TaskManager::apiOpenMP )
{
#pragma omp parallel for schedule(static, 1)
for ( int i = iBegin; i < iEnd; i += grainSize )
{
body.forLoop( i, (std::min)( i + grainSize, iEnd ) );
}
return;
}
#endif // #if USE_OPENMP
#if USE_PPL
if ( gTaskMgr.getApi() == TaskManager::apiPpl )
{
// PPL dispatch
PplBodyAdapter<TBody> pplBody;
pplBody.mBody = &body;
pplBody.mGrainSize = grainSize;
pplBody.mIndexEnd = iEnd;
// note: MSVC 2010 doesn't support partitioner args, so avoid them
concurrency::parallel_for( iBegin,
iEnd,
grainSize,
pplBody
);
return;
}
#endif //#if USE_PPL
#if USE_TBB
if ( gTaskMgr.getApi() == TaskManager::apiTbb )
{
// TBB dispatch
TbbBodyAdapter<TBody> tbbBody;
tbbBody.mBody = &body;
tbb::parallel_for( tbb::blocked_range<int>( iBegin, iEnd, grainSize ),
tbbBody,
tbb::simple_partitioner()
);
return;
}
#endif // #if USE_TBB
{
// run on main thread
body.forLoop( iBegin, iEnd );
}
}
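
The deleted TaskManager above handled backend selection and thread counts per demo; those responsibilities now sit behind the core btITaskScheduler interface used earlier in this commit. Below is a sketch of the thread-count side, built only from accessors that appear in the diff; the wrapper function itself is an illustrative assumption.

// Illustrative replacement for TaskManager::setNumThreads(), using only calls that
// appear in this commit (btGetTaskScheduler, getNumThreads, setNumThreads,
// getMaxNumThreads); the clamping helper itself is an assumption.
#include "LinearMath/btThreads.h"  // assumed header
#include "LinearMath/btMinMax.h"   // btMin / btMax

void applyThreadCount( int requestedThreads )
{
    btITaskScheduler* ts = btGetTaskScheduler();  // whichever scheduler btSetTaskScheduler() installed
    int n = btMax( 1, btMin( requestedThreads, ts->getMaxNumThreads() ) );
    if ( n != ts->getNumThreads() )  // only resize the thread pool when the count changes
    {
        ts->setNumThreads( n );
    }
}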