Bullet 2 threading refactor: moved parallel-for calls into core libs

2017-05-22 00:47:11 -07:00
parent 2f3844e5db
commit dfe184e8d3
14 changed files with 1012 additions and 847 deletions
--- a/src/BulletCollision/CMakeLists.txt
+++ b/src/BulletCollision/CMakeLists.txt
@@ -15,6 +15,7 @@ SET(BulletCollision_SRCS
 	CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp
 	CollisionDispatch/btBoxBoxDetector.cpp
 	CollisionDispatch/btCollisionDispatcher.cpp
+	CollisionDispatch/btCollisionDispatcherMt.cpp
 	CollisionDispatch/btCollisionObject.cpp
 	CollisionDispatch/btCollisionWorld.cpp
 	CollisionDispatch/btCollisionWorldImporter.cpp
@@ -123,6 +124,7 @@ SET(CollisionDispatch_HDRS
 	CollisionDispatch/btCollisionConfiguration.h
 	CollisionDispatch/btCollisionCreateFunc.h
 	CollisionDispatch/btCollisionDispatcher.h
+	CollisionDispatch/btCollisionDispatcherMt.h
 	CollisionDispatch/btCollisionObject.h
 	CollisionDispatch/btCollisionObjectWrapper.h
 	CollisionDispatch/btCollisionWorld.h
--- a/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.cpp
+++ b/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.cpp
@@ -0,0 +1,164 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#include "btCollisionDispatcherMt.h"
+#include "LinearMath/btQuickprof.h"
+
+#include "BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h"
+
+#include "BulletCollision/CollisionShapes/btCollisionShape.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+#include "LinearMath/btPoolAllocator.h"
+#include "BulletCollision/CollisionDispatch/btCollisionConfiguration.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
+
+
+btCollisionDispatcherMt::btCollisionDispatcherMt( btCollisionConfiguration* config, int grainSize )
+    : btCollisionDispatcher( config )
+{
+    m_batchUpdating = false;  // set to true only while dispatchAllCollisionPairs() runs its parallel loop
+    m_grainSize = grainSize;  // iterations per task; passed to btParallelFor() when dispatching pairs
+}
+
+
+btPersistentManifold* btCollisionDispatcherMt::getNewManifold( const btCollisionObject* body0, const btCollisionObject* body1 )
+{
+    // Creates a manifold for (body0, body1). Optional relative contact breaking threshold, turned on by default (use setDispatcherFlags to switch off feature for improved performance)
+
+    btScalar contactBreakingThreshold = ( m_dispatcherFlags & btCollisionDispatcher::CD_USE_RELATIVE_CONTACT_BREAKING_THRESHOLD ) ?
+        btMin( body0->getCollisionShape()->getContactBreakingThreshold( gContactBreakingThreshold ), body1->getCollisionShape()->getContactBreakingThreshold( gContactBreakingThreshold ) )
+        : gContactBreakingThreshold;
+
+    btScalar contactProcessingThreshold = btMin( body0->getContactProcessingThreshold(), body1->getContactProcessingThreshold() );
+
+    void* mem = m_persistentManifoldPoolAllocator->allocate( sizeof( btPersistentManifold ) );  // try the manifold pool first
+    if ( NULL == mem )
+    {
+        // pool memory overflow: by default fall back to dynamic allocation; if a contiguous contact pool is required (flag set), assert instead
+        if ( ( m_dispatcherFlags&CD_DISABLE_CONTACTPOOL_DYNAMIC_ALLOCATION ) == 0 )
+        {
+            mem = btAlignedAlloc( sizeof( btPersistentManifold ), 16 );
+        }
+        else
+        {
+            btAssert( 0 );
+            //make sure to increase the m_defaultMaxPersistentManifoldPoolSize in the btDefaultCollisionConstructionInfo/btDefaultCollisionConfiguration
+            return 0;  // no memory available: callers receive a null manifold
+        }
+    }
+    btPersistentManifold* manifold = new( mem ) btPersistentManifold( body0, body1, 0, contactBreakingThreshold, contactProcessingThreshold );  // placement-new into the block obtained above
+    if ( !m_batchUpdating )
+    {
+        // during a batch update dispatchAllCollisionPairs() rebuilds the manifold pointer array
+        // afterwards, so the array only needs to be updated here when not batch-updating
+        btAssert( !btThreadsAreRunning() );
+        manifold->m_index1a = m_manifoldsPtr.size();  // slot index, read back by releaseManifold()
+        m_manifoldsPtr.push_back( manifold );
+    }
+
+    return manifold;
+}
+
+void btCollisionDispatcherMt::releaseManifold( btPersistentManifold* manifold )
+{
+    clearManifold( manifold );
+    if ( !m_batchUpdating )
+    {
+        // batch updater will update manifold pointers array after finishing, so
+        // only need to update array when not batch-updating
+        btAssert( !btThreadsAreRunning() );  // mirrors getNewManifold(): only the m_manifoldsPtr bookkeeping requires that no workers are running
+        int findIndex = manifold->m_index1a;
+        btAssert( findIndex < m_manifoldsPtr.size() );
+        m_manifoldsPtr.swap( findIndex, m_manifoldsPtr.size() - 1 );  // swap-with-last removal
+        m_manifoldsPtr[ findIndex ]->m_index1a = findIndex;  // fix the index of the element that moved into the hole
+        m_manifoldsPtr.pop_back();
+    }
+
+    manifold->~btPersistentManifold();
+    if ( m_persistentManifoldPoolAllocator->validPtr( manifold ) )  // memory came from the pool
+    {
+        m_persistentManifoldPoolAllocator->freeMemory( manifold );
+    }
+    else  // memory came from the btAlignedAlloc fallback in getNewManifold()
+    {
+        btAlignedFree( manifold );
+    }
+}
+
+struct CollisionDispatcherUpdater : public btIParallelForBody  // parallel-for body: runs the near callback over a range of broadphase pairs
+{
+    btBroadphasePair* mPairArray;  // indexed by the loop counter in forLoop()
+    btNearCallback mCallback;
+    btCollisionDispatcher* mDispatcher;
+    const btDispatcherInfo* mInfo;
+
+    CollisionDispatcherUpdater()
+    {
+        mPairArray = NULL;
+        mCallback = NULL;
+        mDispatcher = NULL;
+        mInfo = NULL;
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE  // BT_OVERRIDE: a base signature change now fails to compile, as in the other parallel-for bodies
+    {
+        for ( int i = iBegin; i < iEnd; ++i )  // half-open range [iBegin, iEnd)
+        {
+            btBroadphasePair* pair = &mPairArray[ i ];
+            mCallback( *pair, *mDispatcher, *mInfo );
+        }
+    }
+};
+
+
+void btCollisionDispatcherMt::dispatchAllCollisionPairs( btOverlappingPairCache* pairCache, const btDispatcherInfo& info, btDispatcher* dispatcher )
+{
+    int pairCount = pairCache->getNumOverlappingPairs();
+    if ( pairCount == 0 )
+    {
+        return;  // nothing to dispatch; m_manifoldsPtr is left untouched
+    }
+    CollisionDispatcherUpdater updater;
+    updater.mCallback = getNearCallback();
+    updater.mPairArray = pairCache->getOverlappingPairArrayPtr();
+    updater.mDispatcher = this;  // note: the 'dispatcher' argument is not used by this implementation
+    updater.mInfo = &info;
+
+    m_batchUpdating = true;  // getNewManifold()/releaseManifold() skip m_manifoldsPtr bookkeeping while this is set
+    btParallelFor( 0, pairCount, m_grainSize, updater );
+    m_batchUpdating = false;
+
+    // rebuild the manifolds array serially, in pair order, to ensure determinism
+    m_manifoldsPtr.resizeNoInitialize( 0 );
+
+    btBroadphasePair* pairs = pairCache->getOverlappingPairArrayPtr();
+    for ( int i = 0; i < pairCount; ++i )
+    {
+        if (btCollisionAlgorithm* algo = pairs[ i ].m_algorithm)  // pairs without an algorithm contribute nothing
+        {
+            algo->getAllContactManifolds( m_manifoldsPtr );
+        }
+    }
+
+    // update the indices (used when releasing manifolds)
+    for ( int i = 0; i < m_manifoldsPtr.size(); ++i )
+    {
+        m_manifoldsPtr[ i ]->m_index1a = i;
+    }
+}
+
+
--- a/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h
+++ b/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h
@@ -0,0 +1,39 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_COLLISION_DISPATCHER_MT_H
+#define BT_COLLISION_DISPATCHER_MT_H
+
+#include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
+#include "LinearMath/btThreads.h"
+
+
+class btCollisionDispatcherMt : public btCollisionDispatcher  // dispatcher that processes overlapping pairs through btParallelFor()
+{
+public:
+    btCollisionDispatcherMt( btCollisionConfiguration* config, int grainSize = 40 );  // grainSize = pair iterations per task
+
+    virtual btPersistentManifold* getNewManifold( const btCollisionObject* body0, const btCollisionObject* body1 ) BT_OVERRIDE;
+    virtual void releaseManifold( btPersistentManifold* manifold ) BT_OVERRIDE;
+
+    virtual void dispatchAllCollisionPairs( btOverlappingPairCache* pairCache, const btDispatcherInfo& info, btDispatcher* dispatcher ) BT_OVERRIDE;
+
+protected:
+    bool m_batchUpdating;  // true while dispatchAllCollisionPairs() is inside its parallel loop
+    int m_grainSize;  // iterations per task handed to btParallelFor()
+};
+
+#endif //BT_COLLISION_DISPATCHER_MT_H
+
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
@@ -108,8 +108,105 @@ struct InplaceSolverIslandCallbackMt : public btSimulationIslandManagerMt::Islan
 };


+///
+/// btConstraintSolverPoolMt
+///

-btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,btBroadphaseInterface* pairCache,btConstraintSolver* constraintSolver, btCollisionConfiguration* collisionConfiguration)
+btConstraintSolverPoolMt::ThreadSolver* btConstraintSolverPoolMt::getAndLockThreadSolver()
+{
+    btAssert( m_solvers.size() > 0 );  // an empty pool would divide by zero below and could never yield a solver
+    int i = btGetCurrentThreadIndex() % m_solvers.size();  // start at a per-thread slot to reduce contention
+    while ( true )  // spins until some solver's mutex can be taken
+    {
+        ThreadSolver& solver = m_solvers[ i ];
+        if ( solver.mutex.tryLock() )
+        {
+            return &solver;  // returned locked; the caller must unlock solver.mutex
+        }
+        i = ( i + 1 ) % m_solvers.size();  // failed, try the next one
+    }
+    return NULL;
+}
+
+void btConstraintSolverPoolMt::init( btConstraintSolver** solvers, int numSolvers )
+{
+    m_solverType = BT_SEQUENTIAL_IMPULSE_SOLVER;  // value kept when the pool is empty
+    m_solvers.resize( numSolvers );
+    for ( int i = 0; i < numSolvers; ++i )
+    {
+        m_solvers[ i ].solver = solvers[ i ];  // pointers are stored as-is; the destructor deletes them
+    }
+    if ( numSolvers > 0 )
+    {
+        m_solverType = solvers[ 0 ]->getSolverType();  // the pool reports the type of its first solver
+    }
+}
+
+// create the solvers for me: builds numSolvers btSequentialImpulseConstraintSolver instances owned by the pool
+btConstraintSolverPoolMt::btConstraintSolverPoolMt( int numSolvers )
+{
+    btAlignedObjectArray<btConstraintSolver*> solvers;
+    solvers.reserve( numSolvers );
+    for ( int i = 0; i < numSolvers; ++i )
+    {
+        btConstraintSolver* solver = new btSequentialImpulseConstraintSolver();
+        solvers.push_back( solver );
+    }
+    init( numSolvers > 0 ? &solvers[ 0 ] : NULL, numSolvers );  // do not index an empty array when numSolvers == 0
+}
+
+// pass in fully constructed solvers; the pool takes ownership (destructor will delete them)
+btConstraintSolverPoolMt::btConstraintSolverPoolMt( btConstraintSolver** solvers, int numSolvers )
+{
+    init( solvers, numSolvers );  // copies the pointers; the 'solvers' array itself is not retained
+}
+
+btConstraintSolverPoolMt::~btConstraintSolverPoolMt()
+{
+    // delete all solvers (both constructors hand ownership to the pool)
+    for ( int i = 0; i < m_solvers.size(); ++i )
+    {
+        ThreadSolver& solver = m_solvers[ i ];
+        delete solver.solver;  // the mutex is not taken here
+        solver.solver = NULL;
+    }
+}
+
+///solve a group of constraints on whichever pooled solver can be locked first; returns that solver's result
+btScalar btConstraintSolverPoolMt::solveGroup( btCollisionObject** bodies,
+    int numBodies,
+    btPersistentManifold** manifolds,
+    int numManifolds,
+    btTypedConstraint** constraints,
+    int numConstraints,
+    const btContactSolverInfo& info,
+    btIDebugDraw* debugDrawer,
+    btDispatcher* dispatcher
+)
+{
+    ThreadSolver* ts = getAndLockThreadSolver();  // returned with ts->mutex held
+    btScalar result = ts->solver->solveGroup( bodies, numBodies, manifolds, numManifolds, constraints, numConstraints, info, debugDrawer, dispatcher );
+    ts->mutex.unlock();
+    return result;  // forward the wrapped solver's value instead of a hard-coded 0.0f
+}
+
+void btConstraintSolverPoolMt::reset()  // resets every solver in the pool, one at a time
+{
+    for ( int i = 0; i < m_solvers.size(); ++i )
+    {
+        ThreadSolver& solver = m_solvers[ i ];
+        solver.mutex.lock();  // blocking lock: waits if another thread is currently using this solver
+        solver.solver->reset();
+        solver.mutex.unlock();
+    }
+}
+
+
+///
+/// btDiscreteDynamicsWorldMt
+///
+
+btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher, btBroadphaseInterface* pairCache, btConstraintSolverPoolMt* constraintSolver, btCollisionConfiguration* collisionConfiguration)
 : btDiscreteDynamicsWorld(dispatcher,pairCache,constraintSolver,collisionConfiguration)
 {
 	if (m_ownsIslandManager)
@@ -124,8 +221,8 @@ btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,bt
 	{
 		void* mem = btAlignedAlloc(sizeof(btSimulationIslandManagerMt),16);
 		btSimulationIslandManagerMt* im = new (mem) btSimulationIslandManagerMt();
-        m_islandManager = im;
        im->setMinimumSolverBatchSize( m_solverInfo.m_minimumSolverBatchSize );
+        m_islandManager = im;
 	}
 }

@@ -145,7 +242,7 @@ btDiscreteDynamicsWorldMt::~btDiscreteDynamicsWorldMt()
 }


-void	btDiscreteDynamicsWorldMt::solveConstraints(btContactSolverInfo& solverInfo)
+void btDiscreteDynamicsWorldMt::solveConstraints(btContactSolverInfo& solverInfo)
 {
 	BT_PROFILE("solveConstraints");

@@ -160,3 +257,65 @@ void	btDiscreteDynamicsWorldMt::solveConstraints(btContactSolverInfo& solverInfo
 }


+struct UpdaterUnconstrainedMotion : public btIParallelForBody  // parallel-for body used by predictUnconstraintMotion()
+{
+    btScalar timeStep;
+    btRigidBody** rigidBodies;  // array indexed by the loop counter
+
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        for ( int i = iBegin; i < iEnd; ++i )  // half-open range [iBegin, iEnd)
+        {
+            btRigidBody* body = rigidBodies[ i ];
+            if ( !body->isStaticOrKinematicObject() )
+            {
+                //don't integrate/update velocities here, it happens in the constraint solver
+                body->applyDamping( timeStep );
+                body->predictIntegratedTransform( timeStep, body->getInterpolationWorldTransform() );  // prediction is written into the body's interpolation transform
+            }
+        }
+    }
+};
+
+
+void btDiscreteDynamicsWorldMt::predictUnconstraintMotion( btScalar timeStep )
+{
+    BT_PROFILE( "predictUnconstraintMotion" );
+    int grainSize = 50;  // num of iterations per task, passed to whichever task scheduler is installed
+    int bodyCount = m_nonStaticRigidBodies.size();
+    UpdaterUnconstrainedMotion update;
+    update.timeStep = timeStep;
+    update.rigidBodies = bodyCount ? &m_nonStaticRigidBodies[ 0 ] : NULL;  // avoid indexing an empty array
+    btParallelFor( 0, bodyCount, grainSize, update );  // still called with an empty range when bodyCount == 0
+}
+
+
+void btDiscreteDynamicsWorldMt::createPredictiveContacts( btScalar timeStep )
+{
+    releasePredictiveContacts();  // runs before any parallel work is started
+    int grainSize = 50;  // num of iterations per task, passed to whichever task scheduler is installed
+    if ( int bodyCount = m_nonStaticRigidBodies.size() )  // skip entirely when there are no bodies
+    {
+        UpdaterCreatePredictiveContacts update;
+        update.world = this;
+        update.timeStep = timeStep;
+        update.rigidBodies = &m_nonStaticRigidBodies[ 0 ];
+        btParallelFor( 0, bodyCount, grainSize, update );  // each task calls createPredictiveContactsInternal() on its sub-range
+    }
+}
+
+
+void btDiscreteDynamicsWorldMt::integrateTransforms( btScalar timeStep )
+{
+    BT_PROFILE( "integrateTransforms" );
+    int grainSize = 50;  // num of iterations per task, passed to whichever task scheduler is installed
+    if ( int bodyCount = m_nonStaticRigidBodies.size() )  // skip entirely when there are no bodies
+    {
+        UpdaterIntegrateTransforms update;
+        update.world = this;
+        update.timeStep = timeStep;
+        update.rigidBodies = &m_nonStaticRigidBodies[ 0 ];
+        btParallelFor( 0, bodyCount, grainSize, update );  // each task calls integrateTransformsInternal() on its sub-range
+    }
+}
+
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
@@ -18,24 +18,116 @@ subject to the following restrictions:
 #define BT_DISCRETE_DYNAMICS_WORLD_MT_H

 #include "btDiscreteDynamicsWorld.h"
+#include "btSimulationIslandManagerMt.h"
+#include "BulletDynamics/ConstraintSolver/btConstraintSolver.h"

 struct InplaceSolverIslandCallbackMt;

+///
+/// btConstraintSolverPoolMt - masquerades as a constraint solver, but really it is a threadsafe pool of them.
+///
+///  Each solver in the pool is protected by a mutex.  When solveGroup is called from a thread,
+///  the pool looks for a solver that isn't being used by another thread, locks it, and dispatches the
+///  call to the solver.
+///  So long as there are at least as many solvers as there are hardware threads, it should never need to
+///  spin wait.
+///
+class btConstraintSolverPoolMt : public btConstraintSolver
+{
+public:
+    // create the solvers for me (numSolvers sequential-impulse solvers, owned by the pool)
+    explicit btConstraintSolverPoolMt( int numSolvers );
+
+    // pass in fully constructed solvers (destructor will delete them)
+    btConstraintSolverPoolMt( btConstraintSolver** solvers, int numSolvers );
+
+    virtual ~btConstraintSolverPoolMt();
+
+    ///solve a group of constraints using the first pooled solver that can be locked
+    virtual btScalar solveGroup( btCollisionObject** bodies,
+        int numBodies,
+        btPersistentManifold** manifolds,
+        int numManifolds,
+        btTypedConstraint** constraints,
+        int numConstraints,
+        const btContactSolverInfo& info,
+        btIDebugDraw* debugDrawer,
+        btDispatcher* dispatcher
+    ) BT_OVERRIDE;
+
+    virtual	void reset() BT_OVERRIDE;  // resets every pooled solver
+    virtual btConstraintSolverType getSolverType() const BT_OVERRIDE { return m_solverType; }  // type of the first solver, set in init()
+
+private:
+    const static size_t kCacheLineSize = 128;  // assumed upper bound on cache line size; ThreadSolver is padded to this many bytes
+    struct ThreadSolver
+    {
+        btConstraintSolver* solver;  // owned by the pool
+        btSpinMutex mutex;  // held while 'solver' is in use by a thread
+        char _cachelinePadding[ kCacheLineSize - sizeof( btSpinMutex ) - sizeof( void* ) ];  // keep mutexes from sharing a cache line
+    };
+    btAlignedObjectArray<ThreadSolver> m_solvers;
+    btConstraintSolverType m_solverType;
+
+    ThreadSolver* getAndLockThreadSolver();  // spins until a solver is locked; caller must unlock its mutex
+    void init( btConstraintSolver** solvers, int numSolvers );  // shared by both constructors
+};
+
+
+
 ///
 /// btDiscreteDynamicsWorldMt -- a version of DiscreteDynamicsWorld with some minor changes to support
 ///                              solving simulation islands on multiple threads.
 ///
+///  Should function exactly like btDiscreteDynamicsWorld.
+///  Also 3 methods that iterate over all of the rigidbodies can run in parallel:
+///     - predictUnconstraintMotion
+///     - integrateTransforms
+///     - createPredictiveContacts
+///
 ATTRIBUTE_ALIGNED16(class) btDiscreteDynamicsWorldMt : public btDiscreteDynamicsWorld
 {
 protected:
    InplaceSolverIslandCallbackMt* m_solverIslandCallbackMt;

-    virtual void	solveConstraints(btContactSolverInfo& solverInfo);
+    virtual void solveConstraints(btContactSolverInfo& solverInfo) BT_OVERRIDE;
+
+    virtual void predictUnconstraintMotion( btScalar timeStep ) BT_OVERRIDE;
+
+    struct UpdaterCreatePredictiveContacts : public btIParallelForBody
+    {
+        btScalar timeStep;
+        btRigidBody** rigidBodies;
+        btDiscreteDynamicsWorldMt* world;
+
+        void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+        {
+            world->createPredictiveContactsInternal( &rigidBodies[ iBegin ], iEnd - iBegin, timeStep );
+        }
+    };
+    virtual void createPredictiveContacts( btScalar timeStep ) BT_OVERRIDE;
+
+    struct UpdaterIntegrateTransforms : public btIParallelForBody
+    {
+        btScalar timeStep;
+        btRigidBody** rigidBodies;
+        btDiscreteDynamicsWorldMt* world;
+
+        void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+        {
+            world->integrateTransformsInternal( &rigidBodies[ iBegin ], iEnd - iBegin, timeStep );
+        }
+    };
+    virtual void integrateTransforms( btScalar timeStep ) BT_OVERRIDE;

 public:
 	BT_DECLARE_ALIGNED_ALLOCATOR();

-	btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,btBroadphaseInterface* pairCache,btConstraintSolver* constraintSolver,btCollisionConfiguration* collisionConfiguration);
+	btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,
+        btBroadphaseInterface* pairCache,
+        btConstraintSolverPoolMt* constraintSolver,   // Note this should be a solver-pool for multi-threading
+        btCollisionConfiguration* collisionConfiguration
+    );
 	virtual ~btDiscreteDynamicsWorldMt();
 };

--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
@@ -15,6 +15,7 @@ subject to the following restrictions:


 #include "LinearMath/btScalar.h"
+#include "LinearMath/btThreads.h"
 #include "btSimulationIslandManagerMt.h"
 #include "BulletCollision/BroadphaseCollision/btDispatcher.h"
 #include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
@@ -44,7 +45,7 @@ btSimulationIslandManagerMt::btSimulationIslandManagerMt()
 {
    m_minimumSolverBatchSize = calcBatchCost(0, 128, 0);
    m_batchIslandMinBodyCount = 32;
-    m_islandDispatch = defaultIslandDispatch;
+    m_islandDispatch = parallelIslandDispatch;
    m_batchIsland = NULL;
 }

@@ -545,7 +546,7 @@ void btSimulationIslandManagerMt::mergeIslands()
 }


-void btSimulationIslandManagerMt::defaultIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback )
+void btSimulationIslandManagerMt::serialIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback )
 {
    // serial dispatch
    btAlignedObjectArray<Island*>& islands = *islandsPtr;
@@ -565,6 +566,40 @@ void btSimulationIslandManagerMt::defaultIslandDispatch( btAlignedObjectArray<Is
    }
 }

+struct UpdateIslandDispatcher : public btIParallelForBody  // parallel-for body: hands each island in the range to the callback
+{
+    btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr;
+    btSimulationIslandManagerMt::IslandCallback* callback;
+
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        for ( int i = iBegin; i < iEnd; ++i )  // half-open range [iBegin, iEnd)
+        {
+            btSimulationIslandManagerMt::Island* island = ( *islandsPtr )[ i ];
+            btPersistentManifold** manifolds = island->manifoldArray.size() ? &island->manifoldArray[ 0 ] : NULL;  // NULL when the island has no manifolds
+            btTypedConstraint** constraintsPtr = island->constraintArray.size() ? &island->constraintArray[ 0 ] : NULL;  // NULL when the island has no constraints
+            callback->processIsland( &island->bodyArray[ 0 ],  // NOTE(review): unguarded, unlike the two arrays above -- presumably islands always contain a body; confirm
+                island->bodyArray.size(),
+                manifolds,
+                island->manifoldArray.size(),
+                constraintsPtr,
+                island->constraintArray.size(),
+                island->id
+            );
+        }
+    }
+};
+
+void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback )
+{
+    int grainSize = 1;  // iterations per task: every island is dispatched as its own task
+    UpdateIslandDispatcher dispatcher;
+    dispatcher.islandsPtr = islandsPtr;
+    dispatcher.callback = callback;  // callback->processIsland() may therefore be invoked from several threads
+    btParallelFor( 0, islandsPtr->size(), grainSize, dispatcher );
+}
+
+
 ///@todo: this is random access, it can be walked 'cache friendly'!
 void btSimulationIslandManagerMt::buildAndProcessIslands( btDispatcher* dispatcher,
                                                        btCollisionWorld* collisionWorld,
--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
@@ -59,7 +59,8 @@ public:
                                    ) = 0;
    };
    typedef void( *IslandDispatchFunc ) ( btAlignedObjectArray<Island*>* islands, IslandCallback* callback );
-    static void defaultIslandDispatch( btAlignedObjectArray<Island*>* islands, IslandCallback* callback );
+    static void serialIslandDispatch( btAlignedObjectArray<Island*>* islands, IslandCallback* callback );
+    static void parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback );
 protected:
    btAlignedObjectArray<Island*> m_allocatedIslands;  // owner of all Islands
    btAlignedObjectArray<Island*> m_activeIslands;  // islands actively in use
--- a/src/LinearMath/btThreads.cpp
+++ b/src/LinearMath/btThreads.cpp
@@ -14,6 +14,247 @@ subject to the following restrictions:


 #include "btThreads.h"
+#include <algorithm>  // for min and max
+
+#if BT_THREADSAFE
+
+#if BT_USE_OPENMP
+
+#include <omp.h>
+
+#endif // #if BT_USE_OPENMP
+
+
+#if BT_USE_PPL
+
+// use Microsoft Parallel Patterns Library (installed with Visual Studio 2010 and later)
+#include <ppl.h>  // if you get a compile error here, check whether your version of Visual Studio includes PPL
+// Visual Studio 2010 and later should come with it
+#include <concrtrm.h>  // for GetProcessorCount()
+
+#endif // #if BT_USE_PPL
+
+
+#if BT_USE_TBB
+
+// use Intel Threading Building Blocks for thread management
+#define __TBB_NO_IMPLICIT_LINKAGE 1
+#include <tbb/tbb.h>
+#include <tbb/task_scheduler_init.h>
+#include <tbb/parallel_for.h>
+#include <tbb/blocked_range.h>
+
+#endif // #if BT_USE_TBB
+
+
+static btITaskScheduler* gBtTaskScheduler;
+static int gThreadsRunningCounter = 0;  // useful for detecting if we are trying to do nested parallel-for calls
+static btSpinMutex gThreadsRunningCounterMutex;
+
+void btPushThreadsAreRunning()  // called by the schedulers right before they start a parallel loop
+{
+    gThreadsRunningCounterMutex.lock();
+    gThreadsRunningCounter++;  // a counter, not a flag, so nested parallel-for calls are tracked
+    gThreadsRunningCounterMutex.unlock();
+}
+
+void btPopThreadsAreRunning()  // called by the schedulers right after a parallel loop has finished
+{
+    gThreadsRunningCounterMutex.lock();
+    gThreadsRunningCounter--;  // must pair with an earlier btPushThreadsAreRunning()
+    gThreadsRunningCounterMutex.unlock();
+}
+
+bool btThreadsAreRunning()  // true while at least one scheduler parallelFor() is in flight
+{
+    return gThreadsRunningCounter != 0;  // read without taking gThreadsRunningCounterMutex
+}
+
+
+void btSetTaskScheduler( btITaskScheduler* ts )  // installs the scheduler used by btParallelFor()
+{
+    gBtTaskScheduler = ts;  // pointer is stored as-is; no ownership is taken and no locking is done
+}
+
+btITaskScheduler* btGetTaskScheduler()  // returns the scheduler last passed to btSetTaskScheduler()
+{
+    return gBtTaskScheduler;  // NULL until btSetTaskScheduler() has been called
+}
+
+void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body )  // runs body over [iBegin, iEnd) in chunks of about grainSize
+{
+    ( gBtTaskScheduler ? gBtTaskScheduler : btGetSequentialTaskScheduler() )->parallelFor( iBegin, iEnd, grainSize, body );  // no scheduler installed: run serially instead of dereferencing NULL
+}
+
+
+#if BT_USE_OPENMP
+///
+/// btTaskSchedulerOpenMP -- OpenMP task scheduler implementation
+///
+class btTaskSchedulerOpenMP : public btITaskScheduler
+{
+    int m_numThreads;  // last value given to setNumThreads(); 0 until then
+public:
+    btTaskSchedulerOpenMP() : btITaskScheduler( "OpenMP" )
+    {
+        m_numThreads = 0;
+    }
+    virtual int getMaxNumThreads() const BT_OVERRIDE
+    {
+        return omp_get_max_threads();
+    }
+    virtual int getNumThreads() const BT_OVERRIDE
+    {
+        return m_numThreads;
+    }
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
+    {
+        m_numThreads = ( std::max )( 1, numThreads );  // clamp to at least one thread
+        omp_set_num_threads( m_numThreads );
+    }
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        btPushThreadsAreRunning();
+#pragma omp parallel for schedule( static, 1 )
+        for ( int i = iBegin; i < iEnd; i += grainSize )  // one iteration per chunk; grainSize must be positive for the loop to advance
+        {
+            body.forLoop( i, ( std::min )( i + grainSize, iEnd ) );  // last chunk is clipped to iEnd
+        }
+        btPopThreadsAreRunning();
+    }
+};
+#endif // #if BT_USE_OPENMP
+
+
+#if BT_USE_TBB
+///
+/// btTaskSchedulerTBB -- task scheduler implemented via Intel Threading Building Blocks
+///
+class btTaskSchedulerTBB : public btITaskScheduler
+{
+    int m_numThreads;  // last value given to setNumThreads(); 0 until then
+    tbb::task_scheduler_init* m_tbbSchedulerInit;  // owned; NULL until setNumThreads() is called
+
+public:
+    btTaskSchedulerTBB() : btITaskScheduler( "IntelTBB" )
+    {
+        m_numThreads = 0;
+        m_tbbSchedulerInit = NULL;
+    }
+    ~btTaskSchedulerTBB()
+    {
+        if ( m_tbbSchedulerInit )
+        {
+            delete m_tbbSchedulerInit;
+            m_tbbSchedulerInit = NULL;
+        }
+    }
+
+    virtual int getMaxNumThreads() const BT_OVERRIDE
+    {
+        return tbb::task_scheduler_init::default_num_threads();
+    }
+    virtual int getNumThreads() const BT_OVERRIDE
+    {
+        return m_numThreads;
+    }
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
+    {
+        m_numThreads = ( std::max )( 1, numThreads );  // clamp to at least one thread
+        if ( m_tbbSchedulerInit )
+        {
+            delete m_tbbSchedulerInit;  // drop the previous init object before creating one with the new count
+            m_tbbSchedulerInit = NULL;
+        }
+        m_tbbSchedulerInit = new tbb::task_scheduler_init( m_numThreads );
+    }
+    struct BodyAdapter  // adapts btIParallelForBody to the callable form tbb::parallel_for expects
+    {
+        const btIParallelForBody* mBody;
+
+        void operator()( const tbb::blocked_range<int>& range ) const
+        {
+            mBody->forLoop( range.begin(), range.end() );
+        }
+    };
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        // TBB dispatch
+        BodyAdapter tbbBody;
+        tbbBody.mBody = &body;
+        btPushThreadsAreRunning();
+        tbb::parallel_for( tbb::blocked_range<int>( iBegin, iEnd, grainSize ),
+            tbbBody,
+            tbb::simple_partitioner()
+        );
+        btPopThreadsAreRunning();
+    }
+};
+#endif // #if BT_USE_TBB
+
+#if BT_USE_PPL
+///
+/// btTaskSchedulerPPL -- task scheduler implemented via Microsoft Parallel Patterns Lib
+///
+class btTaskSchedulerPPL : public btITaskScheduler
+{
+    int m_numThreads;  // last value given to setNumThreads(); 0 until then
+public:
+    btTaskSchedulerPPL() : btITaskScheduler( "PPL" )
+    {
+        m_numThreads = 0;
+    }
+    virtual int getMaxNumThreads() const BT_OVERRIDE
+    {
+        return concurrency::GetProcessorCount();
+    }
+    virtual int getNumThreads() const BT_OVERRIDE
+    {
+        return m_numThreads;
+    }
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
+    {
+        m_numThreads = ( std::max )( 1, numThreads );  // clamp to at least one thread
+        using namespace concurrency;
+        if ( CurrentScheduler::Id() != -1 )  // a scheduler is already attached to this context
+        {
+            CurrentScheduler::Detach();
+        }
+        SchedulerPolicy policy;
+        policy.SetConcurrencyLimits( m_numThreads, m_numThreads );  // min and max concurrency both set to the requested count
+        CurrentScheduler::Create( policy );
+    }
+    struct BodyAdapter  // adapts btIParallelForBody to the per-index callable parallel_for expects
+    {
+        const btIParallelForBody* mBody;
+        int mGrainSize;
+        int mIndexEnd;
+
+        void operator()( int i ) const  // i is the first index of a chunk
+        {
+            mBody->forLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );  // last chunk is clipped to mIndexEnd
+        }
+    };
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        // PPL dispatch
+        BodyAdapter pplBody;
+        pplBody.mBody = &body;
+        pplBody.mGrainSize = grainSize;
+        pplBody.mIndexEnd = iEnd;
+        btPushThreadsAreRunning();
+        // note: MSVC 2010 doesn't support partitioner args, so avoid them
+        concurrency::parallel_for( iBegin,
+            iEnd,
+            grainSize,  // used as the loop step, so the adapter is invoked once per chunk start
+            pplBody
+        );
+        btPopThreadsAreRunning();
+    }
+};
+#endif // #if BT_USE_PPL
+
+

 //
 // Lightweight spin-mutex based on atomics
@@ -22,8 +263,6 @@ subject to the following restrictions:
 // context switching.
 // 

-#if BT_THREADSAFE
-
 #if __cplusplus >= 201103L

 // for anything claiming full C++11 compliance, use C++11 atomics
@@ -229,3 +468,64 @@ bool btSpinMutex::tryLock()

 #endif // #if BT_THREADSAFE

+
+///
+/// btTaskSchedulerSequential -- non-threaded implementation of task scheduler
+///                              (fallback in case no multi-threaded schedulers are available)
+///
+class btTaskSchedulerSequential : public btITaskScheduler
+{
+public:
+    btTaskSchedulerSequential() : btITaskScheduler( "Sequential" ) {}
+    virtual int getMaxNumThreads() const BT_OVERRIDE { return 1; }
+    virtual int getNumThreads() const BT_OVERRIDE { return 1; }
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE {}  // ignored: always exactly one thread
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        body.forLoop( iBegin, iEnd );  // whole range in a single call on the calling thread; grainSize is unused
+    }
+};
+
+// get the non-threaded task scheduler (always available); returns a function-local static instance
+btITaskScheduler* btGetSequentialTaskScheduler()
+{
+    static btTaskSchedulerSequential sTaskScheduler;
+    return &sTaskScheduler;
+}
+
+
+// get the OpenMP task scheduler (a function-local static); returns null unless built with BT_USE_OPENMP and BT_THREADSAFE
+btITaskScheduler* btGetOpenMPTaskScheduler()
+{
+#if BT_USE_OPENMP && BT_THREADSAFE
+    static btTaskSchedulerOpenMP sTaskScheduler;
+    return &sTaskScheduler;
+#else
+    return NULL;
+#endif
+}
+
+
+// get the Intel TBB task scheduler (a function-local static); returns null unless built with BT_USE_TBB and BT_THREADSAFE
+btITaskScheduler* btGetTBBTaskScheduler()
+{
+#if BT_USE_TBB && BT_THREADSAFE
+    static btTaskSchedulerTBB sTaskScheduler;
+    return &sTaskScheduler;
+#else
+    return NULL;
+#endif
+}
+
+
+// get the PPL task scheduler (a function-local static); returns null unless built with BT_USE_PPL and BT_THREADSAFE
+btITaskScheduler* btGetPPLTaskScheduler()
+{
+#if BT_USE_PPL && BT_THREADSAFE
+    static btTaskSchedulerPPL sTaskScheduler;
+    return &sTaskScheduler;
+#else
+    return NULL;
+#endif
+}
+
--- a/src/LinearMath/btThreads.h
+++ b/src/LinearMath/btThreads.h
@@ -19,6 +19,15 @@ subject to the following restrictions:

 #include "btScalar.h" // has definitions like SIMD_FORCE_INLINE

+#if defined (_MSC_VER) && _MSC_VER >= 1600
+// give us a compile error if the signature of any overridden method is changed
+#define BT_OVERRIDE override
+#endif
+
+#ifndef BT_OVERRIDE
+#define BT_OVERRIDE
+#endif
+
 ///
 /// btSpinMutex -- lightweight spin-mutex implemented with atomic ops, never puts
 ///               a thread to sleep because it is designed to be used with a task scheduler
@@ -59,6 +68,7 @@ SIMD_FORCE_INLINE bool btMutexTryLock( btSpinMutex* mutex )

 // for internal use only
 bool btIsMainThread();
+bool btThreadsAreRunning();
 unsigned int btGetCurrentThreadIndex();
 const unsigned int BT_MAX_THREAD_COUNT = 64;

@@ -71,5 +81,55 @@ SIMD_FORCE_INLINE void btMutexUnlock( btSpinMutex* ) {}
 SIMD_FORCE_INLINE bool btMutexTryLock( btSpinMutex* ) {return true;}
 #endif

+//
+// btIParallelForBody -- subclass this to express work that can be done in parallel;
+//                       forLoop() is called with one half-open chunk [iBegin, iEnd) of the range given to btParallelFor()
+class btIParallelForBody
+{
+public:
+    virtual void forLoop( int iBegin, int iEnd ) const = 0;  // const: may be invoked concurrently on the same object
+};
+
+//
+// btITaskScheduler -- subclass this to implement a task scheduler that can dispatch work to
+//                     worker threads
+//
+class btITaskScheduler
+{
+    const char* m_name;  // pointer is stored, not copied; the built-in schedulers pass string literals
+public:
+    btITaskScheduler( const char* name ) : m_name( name ) {}
+    const char* getName() const { return m_name; }
+
+    virtual ~btITaskScheduler() {}
+    virtual int getMaxNumThreads() const = 0;  // upper bound reported by the backend
+    virtual int getNumThreads() const = 0;  // currently configured thread count
+    virtual void setNumThreads( int numThreads ) = 0;
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) = 0;  // run body over [iBegin, iEnd) in chunks of about grainSize iterations
+};
+
+// set the task scheduler to use for all calls to btParallelFor()
+// NOTE: you must set this prior to using any of the multi-threaded "Mt" classes
+void btSetTaskScheduler( btITaskScheduler* ts );
+
+// get the current task scheduler
+btITaskScheduler* btGetTaskScheduler();
+
+// get non-threaded task scheduler (always available)
+btITaskScheduler* btGetSequentialTaskScheduler();
+
+// get OpenMP task scheduler (if available, otherwise returns null)
+btITaskScheduler* btGetOpenMPTaskScheduler();
+
+// get Intel TBB task scheduler (if available, otherwise returns null)
+btITaskScheduler* btGetTBBTaskScheduler();
+
+// get PPL task scheduler (if available, otherwise returns null)
+btITaskScheduler* btGetPPLTaskScheduler();
+
+// btParallelFor -- call this to dispatch work like a for-loop
+//                 (iterations may be done out of order, so no dependencies are allowed)
+void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body );
+

 #endif //BT_THREADS_H