From dfe184e8d3bcb7a81f3372c91388895c7e9218cf Mon Sep 17 00:00:00 2001
From: Lunkhound <lunkhound@gmail.com>
Date: Mon, 22 May 2017 00:47:11 -0700
Subject: [PATCH] Bullet 2 threading refactor: moved parallel-for calls into
 core libs

---
 CMakeLists.txt                                |  42 +-
 examples/Benchmarks/BenchmarkDemo.cpp         |   5 +-
 examples/ExampleBrowser/CMakeLists.txt        |  28 +-
 .../CommonRigidBodyMTBase.cpp                 | 576 ++++--------------
 examples/MultiThreadedDemo/ParallelFor.h      | 336 ----------
 src/BulletCollision/CMakeLists.txt            |   2 +
 .../btCollisionDispatcherMt.cpp               | 164 +++++
 .../btCollisionDispatcherMt.h                 |  39 ++
 .../Dynamics/btDiscreteDynamicsWorldMt.cpp    | 165 ++++-
 .../Dynamics/btDiscreteDynamicsWorldMt.h      |  96 ++-
 .../Dynamics/btSimulationIslandManagerMt.cpp  |  39 +-
 .../Dynamics/btSimulationIslandManagerMt.h    |   3 +-
 src/LinearMath/btThreads.cpp                  | 304 ++++++++-
 src/LinearMath/btThreads.h                    |  60 ++
 14 files changed, 1012 insertions(+), 847 deletions(-)
 delete mode 100644 examples/MultiThreadedDemo/ParallelFor.h
 create mode 100644 src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.cpp
 create mode 100644 src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6ad1a9706..65859e663 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,15 @@ OPTION(USE_DOUBLE_PRECISION "Use double precision"	OFF)
 OPTION(USE_GRAPHICAL_BENCHMARK "Use Graphical Benchmark" ON)
 OPTION(BUILD_SHARED_LIBS "Use shared libraries" OFF)
 OPTION(USE_SOFT_BODY_MULTI_BODY_DYNAMICS_WORLD "Use btSoftMultiBodyDynamicsWorld" OFF)
-OPTION(BULLET2_USE_THREAD_LOCKS "Build Bullet 2 libraries with mutex locking around certain operations" OFF)
+
+OPTION(BULLET2_USE_THREAD_LOCKS "Build Bullet 2 libraries with mutex locking around certain operations (required for multi-threading)" OFF)
+IF (BULLET2_USE_THREAD_LOCKS)
+    OPTION(BULLET2_USE_OPEN_MP_MULTITHREADING "Build Bullet 2 with support for multi-threading with OpenMP (requires a compiler with OpenMP support)" OFF)
+    OPTION(BULLET2_USE_TBB_MULTITHREADING "Build Bullet 2 with support for multi-threading with Intel Threading Building Blocks (requires the TBB library to be already installed)" OFF)
+    IF (MSVC)
+        OPTION(BULLET2_USE_PPL_MULTITHREADING "Build Bullet 2 with support for multi-threading with Microsoft Parallel Patterns Library (requires MSVC compiler)" OFF)
+    ENDIF (MSVC)
+ENDIF (BULLET2_USE_THREAD_LOCKS)
 
 OPTION(USE_MSVC_INCREMENTAL_LINKING "Use MSVC Incremental Linking" OFF)
 OPTION(USE_CUSTOM_VECTOR_MATH "Use custom vectormath library" OFF)
@@ -208,6 +216,30 @@ IF(BULLET2_USE_THREAD_LOCKS)
 	ENDIF (NOT MSVC)
 ENDIF (BULLET2_USE_THREAD_LOCKS)
 
+IF (BULLET2_USE_OPEN_MP_MULTITHREADING)
+    ADD_DEFINITIONS("-DBT_USE_OPENMP=1")
+    IF (MSVC)
+        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
+    ELSE (MSVC)
+        # GCC, Clang
+        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+    ENDIF (MSVC)
+ENDIF (BULLET2_USE_OPEN_MP_MULTITHREADING)
+
+IF (BULLET2_USE_TBB_MULTITHREADING)
+    SET (BULLET2_TBB_INCLUDE_DIR "not found" CACHE PATH "Directory for Intel TBB includes.")
+    SET (BULLET2_TBB_LIB_DIR "not found" CACHE PATH "Directory for Intel TBB libraries.")
+    find_library(TBB_LIBRARY tbb PATHS ${BULLET2_TBB_LIB_DIR})
+    find_library(TBBMALLOC_LIBRARY tbbmalloc PATHS ${BULLET2_TBB_LIB_DIR})
+    ADD_DEFINITIONS("-DBT_USE_TBB=1")
+    INCLUDE_DIRECTORIES( ${BULLET2_TBB_INCLUDE_DIR} )
+    LINK_LIBRARIES( ${TBB_LIBRARY} ${TBBMALLOC_LIBRARY} )
+ENDIF (BULLET2_USE_TBB_MULTITHREADING)
+
+IF (BULLET2_USE_PPL_MULTITHREADING)
+    ADD_DEFINITIONS("-DBT_USE_PPL=1")
+ENDIF (BULLET2_USE_PPL_MULTITHREADING)
+
 IF (WIN32)
 OPTION(USE_GLUT "Use Glut"	ON)
 ADD_DEFINITIONS( -D_CRT_SECURE_NO_WARNINGS )
@@ -350,14 +382,6 @@ IF(BUILD_BULLET2_DEMOS)
 		SUBDIRS(examples)
 	ENDIF()
 
-        IF (BULLET2_USE_THREAD_LOCKS)
-            OPTION(BULLET2_MULTITHREADED_OPEN_MP_DEMO "Build Bullet 2 MultithreadedDemo using OpenMP (requires a compiler with OpenMP support)" OFF)
-            OPTION(BULLET2_MULTITHREADED_TBB_DEMO "Build Bullet 2 MultithreadedDemo using Intel Threading Building Blocks (requires the TBB library to be already installed)" OFF)
-            IF (MSVC)
-                OPTION(BULLET2_MULTITHREADED_PPL_DEMO "Build Bullet 2 MultithreadedDemo using Microsoft Parallel Patterns Library (requires MSVC compiler)" OFF)
-            ENDIF (MSVC)
-        ENDIF (BULLET2_USE_THREAD_LOCKS)
-
 ENDIF(BUILD_BULLET2_DEMOS)
 
 
diff --git a/examples/Benchmarks/BenchmarkDemo.cpp b/examples/Benchmarks/BenchmarkDemo.cpp
index d9a1b3ef0..ec443de6b 100644
--- a/examples/Benchmarks/BenchmarkDemo.cpp
+++ b/examples/Benchmarks/BenchmarkDemo.cpp
@@ -32,7 +32,6 @@ subject to the following restrictions:
 
 #include "LinearMath/btAlignedObjectArray.h"
 #include "LinearMath/btTransform.h"
-#include "../MultiThreadedDemo/ParallelFor.h"
 
 class btDynamicsWorld;
 
@@ -230,7 +229,7 @@ public:
         }
     }
 
-    struct CastRaysLoopBody
+    struct CastRaysLoopBody : public btIParallelForBody
     {
         btCollisionWorld* mWorld;
 		btRaycastBar2* mRaycasts;
@@ -274,7 +273,7 @@ public:
         {
             CastRaysLoopBody rayLooper(cw, this);
             int grainSize = 20;  // number of raycasts per task
-            parallelFor( 0, NUMRAYS, grainSize, rayLooper );
+            btParallelFor( 0, NUMRAYS, grainSize, rayLooper );
         }
         else
 #endif // USE_PARALLEL_RAYCASTS
diff --git a/examples/ExampleBrowser/CMakeLists.txt b/examples/ExampleBrowser/CMakeLists.txt
index 23c283fcb..4a4db93db 100644
--- a/examples/ExampleBrowser/CMakeLists.txt
+++ b/examples/ExampleBrowser/CMakeLists.txt
@@ -110,29 +110,6 @@ ELSE(WIN32)
 	ENDIF(APPLE)
 ENDIF(WIN32)
 
-IF (BULLET2_MULTITHREADED_OPEN_MP_DEMO)
-    ADD_DEFINITIONS("-DBT_USE_OPENMP=1")
-    IF (MSVC)
-        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
-    ELSE (MSVC)
-        # GCC, Clang
-        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
-    ENDIF (MSVC)
-ENDIF (BULLET2_MULTITHREADED_OPEN_MP_DEMO)
-
-IF (BULLET2_MULTITHREADED_PPL_DEMO)
-    ADD_DEFINITIONS("-DBT_USE_PPL=1")
-ENDIF (BULLET2_MULTITHREADED_PPL_DEMO)
-
-IF (BULLET2_MULTITHREADED_TBB_DEMO)
-    SET (BULLET2_TBB_INCLUDE_DIR "not found" CACHE PATH "Directory for Intel TBB includes.")
-    SET (BULLET2_TBB_LIB_DIR "not found" CACHE PATH "Directory for Intel TBB libraries.")
-    find_library(TBB_LIBRARY tbb PATHS ${BULLET2_TBB_LIB_DIR})
-    find_library(TBBMALLOC_LIBRARY tbbmalloc PATHS ${BULLET2_TBB_LIB_DIR})
-    ADD_DEFINITIONS("-DBT_USE_TBB=1")
-    INCLUDE_DIRECTORIES( ${BULLET2_TBB_INCLUDE_DIR} )
-    LINK_LIBRARIES( ${TBB_LIBRARY} ${TBBMALLOC_LIBRARY} )
-ENDIF (BULLET2_MULTITHREADED_TBB_DEMO)
 
 SET(ExtendedTutorialsSources
 	../ExtendedTutorials/Chain.cpp 
@@ -207,7 +184,6 @@ SET(BulletExampleBrowser_SRCS
 	../MultiThreadedDemo/MultiThreadedDemo.h
 	../MultiThreadedDemo/CommonRigidBodyMTBase.cpp
 	../MultiThreadedDemo/CommonRigidBodyMTBase.h
-	../MultiThreadedDemo/ParallelFor.h
 	../Tutorial/Tutorial.cpp
 	../Tutorial/Tutorial.h
 	../Tutorial/Dof6ConstraintTutorial.cpp
@@ -386,7 +362,7 @@ ADD_CUSTOM_COMMAND(
                 COMMAND ${CMAKE_COMMAND} ARGS -E copy_directory ${BULLET_PHYSICS_SOURCE_DIR}/data ${PROJECT_BINARY_DIR}/data
         )
 
-IF (BULLET2_MULTITHREADED_TBB_DEMO AND WIN32)
+IF (BULLET2_USE_TBB_MULTITHREADING AND WIN32)
     # add a post build command to copy some dlls to the executable directory
     set(TBB_VC_VER "vc12")
     set(TBB_VC_ARCH "ia32")
@@ -400,7 +376,7 @@ IF (BULLET2_MULTITHREADED_TBB_DEMO AND WIN32)
     COMMAND ${CMAKE_COMMAND} -E copy_if_different
         "${BULLET2_TBB_INCLUDE_DIR}/../bin/${TBB_VC_ARCH}/${TBB_VC_VER}/tbbmalloc.dll"
         $<TARGET_FILE_DIR:App_ExampleBrowser>)
-ENDIF (BULLET2_MULTITHREADED_TBB_DEMO AND WIN32)
+ENDIF (BULLET2_USE_TBB_MULTITHREADING AND WIN32)
 
 
 IF (INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES)
diff --git a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
index 81446ee0b..82797e35e 100644
--- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
+++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
@@ -23,10 +23,10 @@ class btCollisionShape;
 
 #include "CommonRigidBodyMTBase.h"
 #include "../CommonInterfaces/CommonParameterInterface.h"
-#include "ParallelFor.h"
 #include "LinearMath/btAlignedObjectArray.h"
 #include "LinearMath/btPoolAllocator.h"
 #include "btBulletCollisionCommon.h"
+#include "BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h"
 #include "BulletDynamics/Dynamics/btSimulationIslandManagerMt.h"  // for setSplitIslands()
 #include "BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h"
 #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"
@@ -36,20 +36,6 @@ class btCollisionShape;
 #include "BulletDynamics/MLCPSolvers/btDantzigSolver.h"
 #include "BulletDynamics/MLCPSolvers/btLemkeSolver.h"
 
-TaskManager gTaskMgr;
-
-#define USE_PARALLEL_NARROWPHASE 1  // detect collisions in parallel
-#define USE_PARALLEL_ISLAND_SOLVER 1   // solve simulation islands in parallel
-#define USE_PARALLEL_CREATE_PREDICTIVE_CONTACTS 1
-#define USE_PARALLEL_INTEGRATE_TRANSFORMS 1
-#define USE_PARALLEL_PREDICT_UNCONSTRAINED_MOTION 1
-
-#if defined (_MSC_VER) && _MSC_VER >= 1600
-// give us a compile error if any signatures of overriden methods is changed
-#define BT_OVERRIDE override
-#else
-#define BT_OVERRIDE
-#endif
 
 static int gNumIslands = 0;
 
@@ -124,7 +110,7 @@ public:
 };
 
 
-Profiler gProfiler;
+static Profiler gProfiler;
 
 class ProfileHelper
 {
@@ -141,457 +127,84 @@ public:
     }
 };
 
-int gThreadsRunningCounter = 0;
-btSpinMutex gThreadsRunningCounterMutex;
-
-void btPushThreadsAreRunning()
+static void profileBeginCallback( btDynamicsWorld *world, btScalar timeStep )
 {
-    gThreadsRunningCounterMutex.lock();
-    gThreadsRunningCounter++;
-    gThreadsRunningCounterMutex.unlock();
+    gProfiler.begin( Profiler::kRecordInternalTimeStep );
 }
 
-void btPopThreadsAreRunning()
+static void profileEndCallback( btDynamicsWorld *world, btScalar timeStep )
 {
-    gThreadsRunningCounterMutex.lock();
-    gThreadsRunningCounter--;
-    gThreadsRunningCounterMutex.unlock();
-}
-
-bool btThreadsAreRunning()
-{
-    return gThreadsRunningCounter != 0;
+    gProfiler.end( Profiler::kRecordInternalTimeStep );
 }
 
 
-#if USE_PARALLEL_NARROWPHASE
-
-class MyCollisionDispatcher : public btCollisionDispatcher
+///
+/// MyCollisionDispatcher -- subclassed for profiling purposes
+///
+class MyCollisionDispatcher : public btCollisionDispatcherMt
 {
-    btSpinMutex m_manifoldPtrsMutex;
-
+    typedef btCollisionDispatcherMt ParentClass;
 public:
-    MyCollisionDispatcher( btCollisionConfiguration* config ) : btCollisionDispatcher( config )
+    MyCollisionDispatcher( btCollisionConfiguration* config, int grainSize ) : btCollisionDispatcherMt( config, grainSize )
     {
     }
 
-    virtual ~MyCollisionDispatcher()
-    {
-    }
-
-    btPersistentManifold* getNewManifold( const btCollisionObject* body0, const btCollisionObject* body1 ) BT_OVERRIDE
-    {
-        // added spin-locks
-        //optional relative contact breaking threshold, turned on by default (use setDispatcherFlags to switch off feature for improved performance)
-
-        btScalar contactBreakingThreshold = ( m_dispatcherFlags & btCollisionDispatcher::CD_USE_RELATIVE_CONTACT_BREAKING_THRESHOLD ) ?
-            btMin( body0->getCollisionShape()->getContactBreakingThreshold( gContactBreakingThreshold ), body1->getCollisionShape()->getContactBreakingThreshold( gContactBreakingThreshold ) )
-            : gContactBreakingThreshold;
-
-        btScalar contactProcessingThreshold = btMin( body0->getContactProcessingThreshold(), body1->getContactProcessingThreshold() );
-
-        void* mem = m_persistentManifoldPoolAllocator->allocate( sizeof( btPersistentManifold ) );
-        if (NULL == mem)
-        {
-            //we got a pool memory overflow, by default we fallback to dynamically allocate memory. If we require a contiguous contact pool then assert.
-            if ( ( m_dispatcherFlags&CD_DISABLE_CONTACTPOOL_DYNAMIC_ALLOCATION ) == 0 )
-            {
-                mem = btAlignedAlloc( sizeof( btPersistentManifold ), 16 );
-            }
-            else
-            {
-                btAssert( 0 );
-                //make sure to increase the m_defaultMaxPersistentManifoldPoolSize in the btDefaultCollisionConstructionInfo/btDefaultCollisionConfiguration
-                return 0;
-            }
-        }
-        btPersistentManifold* manifold = new(mem) btPersistentManifold( body0, body1, 0, contactBreakingThreshold, contactProcessingThreshold );
-        m_manifoldPtrsMutex.lock();
-        manifold->m_index1a = m_manifoldsPtr.size();
-        m_manifoldsPtr.push_back( manifold );
-        m_manifoldPtrsMutex.unlock();
-
-        return manifold;
-    }
-
-    void releaseManifold( btPersistentManifold* manifold ) BT_OVERRIDE
-    {
-        clearManifold( manifold );
-
-        m_manifoldPtrsMutex.lock();
-        int findIndex = manifold->m_index1a;
-        btAssert( findIndex < m_manifoldsPtr.size() );
-        m_manifoldsPtr.swap( findIndex, m_manifoldsPtr.size() - 1 );
-        m_manifoldsPtr[ findIndex ]->m_index1a = findIndex;
-        m_manifoldsPtr.pop_back();
-        m_manifoldPtrsMutex.unlock();
-
-        manifold->~btPersistentManifold();
-        if ( m_persistentManifoldPoolAllocator->validPtr( manifold ) )
-        {
-            m_persistentManifoldPoolAllocator->freeMemory( manifold );
-        }
-        else
-        {
-            btAlignedFree( manifold );
-        }
-    }
-
-    struct Updater
-    {
-        btBroadphasePair* mPairArray;
-        btNearCallback mCallback;
-        btCollisionDispatcher* mDispatcher;
-        const btDispatcherInfo* mInfo;
-
-        Updater()
-        {
-            mPairArray = NULL;
-            mCallback = NULL;
-            mDispatcher = NULL;
-            mInfo = NULL;
-        }
-        void forLoop( int iBegin, int iEnd ) const
-        {
-            for ( int i = iBegin; i < iEnd; ++i )
-            {
-                btBroadphasePair* pair = &mPairArray[ i ];
-                mCallback( *pair, *mDispatcher, *mInfo );
-            }
-        }
-    };
-
     virtual void dispatchAllCollisionPairs( btOverlappingPairCache* pairCache, const btDispatcherInfo& info, btDispatcher* dispatcher ) BT_OVERRIDE
     {
-        ProfileHelper prof(Profiler::kRecordDispatchAllCollisionPairs);
-        int grainSize = 40;  // iterations per task
-        int pairCount = pairCache->getNumOverlappingPairs();
-        Updater updater;
-        updater.mCallback = getNearCallback();
-        updater.mPairArray = pairCount > 0 ? pairCache->getOverlappingPairArrayPtr() : NULL;
-        updater.mDispatcher = this;
-        updater.mInfo = &info;
-
-        btPushThreadsAreRunning();
-        parallelFor( 0, pairCount, grainSize, updater );
-        btPopThreadsAreRunning();
-        
-        if (m_manifoldsPtr.size() < 1)
-            return;
-
-        // reconstruct the manifolds array to ensure determinism
-        m_manifoldsPtr.resizeNoInitialize(0);
-        btBroadphasePair* pairs = pairCache->getOverlappingPairArrayPtr();
-        for (int i = 0; i < pairCount; ++i)
-        {
-            btCollisionAlgorithm* algo = pairs[i].m_algorithm;
-            if (algo) algo->getAllContactManifolds(m_manifoldsPtr);
-        }
-
-        // update the indices (used when releasing manifolds)
-        for (int i = 0; i < m_manifoldsPtr.size(); ++i)
-            m_manifoldsPtr[i]->m_index1a = i;
+        ProfileHelper prof( Profiler::kRecordDispatchAllCollisionPairs );
+        ParentClass::dispatchAllCollisionPairs( pairCache, info, dispatcher );
     }
 };
 
-#endif
 
-
-#if USE_PARALLEL_ISLAND_SOLVER
 ///
-/// MyConstraintSolverPool - masquerades as a constraint solver, but really it is a threadsafe pool of them.
-///
-///  Each solver in the pool is protected by a mutex.  When solveGroup is called from a thread,
-///  the pool looks for a solver that isn't being used by another thread, locks it, and dispatches the
-///  call to the solver.
-///  So long as there are at least as many solvers as there are hardware threads, it should never need to
-///  spin wait.
-///
-class MyConstraintSolverPool : public btConstraintSolver
+/// myParallelIslandDispatch -- wrap default parallel dispatch for profiling and to get the number of simulation islands
+//
+void myParallelIslandDispatch( btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr, btSimulationIslandManagerMt::IslandCallback* callback )
 {
-    const static size_t kCacheLineSize = 128;
-    struct ThreadSolver
-    {
-        btConstraintSolver* solver;
-        btSpinMutex mutex;
-        char _cachelinePadding[ kCacheLineSize - sizeof( btSpinMutex ) - sizeof( void* ) ];  // keep mutexes from sharing a cache line
-    };
-    btAlignedObjectArray<ThreadSolver> m_solvers;
-    btConstraintSolverType m_solverType;
-
-    ThreadSolver* getAndLockThreadSolver()
-    {
-        while ( true )
-        {
-            for ( int i = 0; i < m_solvers.size(); ++i )
-            {
-                ThreadSolver& solver = m_solvers[ i ];
-                if ( solver.mutex.tryLock() )
-                {
-                    return &solver;
-                }
-            }
-        }
-        return NULL;
-    }
-    void init( btConstraintSolver** solvers, int numSolvers )
-    {
-        m_solverType = BT_SEQUENTIAL_IMPULSE_SOLVER;
-        m_solvers.resize( numSolvers );
-        for ( int i = 0; i < numSolvers; ++i )
-        {
-            m_solvers[ i ].solver = solvers[ i ];
-        }
-        if ( numSolvers > 0 )
-        {
-            m_solverType = solvers[ 0 ]->getSolverType();
-        }
-    }
-public:
-    // create the solvers for me
-    explicit MyConstraintSolverPool( int numSolvers )
-    {
-        btAlignedObjectArray<btConstraintSolver*> solvers;
-        solvers.reserve( numSolvers );
-        for ( int i = 0; i < numSolvers; ++i )
-        {
-            btConstraintSolver* solver = new btSequentialImpulseConstraintSolver();
-            solvers.push_back( solver );
-        }
-        init( &solvers[ 0 ], numSolvers );
-    }
-
-    // pass in fully constructed solvers (destructor will delete them)
-    MyConstraintSolverPool( btConstraintSolver** solvers, int numSolvers )
-    {
-        init( solvers, numSolvers );
-    }
-    virtual ~MyConstraintSolverPool()
-    {
-        // delete all solvers
-        for ( int i = 0; i < m_solvers.size(); ++i )
-        {
-            ThreadSolver& solver = m_solvers[ i ];
-            delete solver.solver;
-            solver.solver = NULL;
-        }
-    }
-
-    //virtual void prepareSolve( int /* numBodies */, int /* numManifolds */ ) { ; } // does nothing
-
-    ///solve a group of constraints
-    virtual btScalar solveGroup( btCollisionObject** bodies,
-                                 int numBodies,
-                                 btPersistentManifold** manifolds,
-                                 int numManifolds,
-                                 btTypedConstraint** constraints,
-                                 int numConstraints,
-                                 const btContactSolverInfo& info,
-                                 btIDebugDraw* debugDrawer,
-                                 btDispatcher* dispatcher
-                                 )
-    {
-        ThreadSolver* solver = getAndLockThreadSolver();
-        solver->solver->solveGroup( bodies, numBodies, manifolds, numManifolds, constraints, numConstraints, info, debugDrawer, dispatcher );
-        solver->mutex.unlock();
-        return 0.0f;
-    }
-
-    //virtual void allSolved( const btContactSolverInfo& /* info */, class btIDebugDraw* /* debugDrawer */ ) { ; } // does nothing
-
-    ///clear internal cached data and reset random seed
-    virtual	void reset()
-    {
-        for ( int i = 0; i < m_solvers.size(); ++i )
-        {
-            ThreadSolver& solver = m_solvers[ i ];
-            solver.mutex.lock();
-            solver.solver->reset();
-            solver.mutex.unlock();
-        }
-    }
-
-    virtual btConstraintSolverType getSolverType() const
-    {
-        return m_solverType;
-    }
-};
-
-struct UpdateIslandDispatcher
-{
-    btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr;
-    btSimulationIslandManagerMt::IslandCallback* callback;
-
-    void forLoop( int iBegin, int iEnd ) const
-    {
-        for ( int i = iBegin; i < iEnd; ++i )
-        {
-            btSimulationIslandManagerMt::Island* island = ( *islandsPtr )[ i ];
-            btPersistentManifold** manifolds = island->manifoldArray.size() ? &island->manifoldArray[ 0 ] : NULL;
-            btTypedConstraint** constraintsPtr = island->constraintArray.size() ? &island->constraintArray[ 0 ] : NULL;
-            callback->processIsland( &island->bodyArray[ 0 ],
-                                     island->bodyArray.size(),
-                                     manifolds,
-                                     island->manifoldArray.size(),
-                                     constraintsPtr,
-                                     island->constraintArray.size(),
-                                     island->id
-                                     );
-        }
-    }
-};
-
-void parallelIslandDispatch( btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr, btSimulationIslandManagerMt::IslandCallback* callback )
-{
-    ProfileHelper prof(Profiler::kRecordDispatchIslands);
+    ProfileHelper prof( Profiler::kRecordDispatchIslands );
     gNumIslands = islandsPtr->size();
-    int grainSize = 1;  // iterations per task
-    UpdateIslandDispatcher dispatcher;
-    dispatcher.islandsPtr = islandsPtr;
-    dispatcher.callback = callback;
-    btPushThreadsAreRunning();
-    parallelFor( 0, islandsPtr->size(), grainSize, dispatcher );
-    btPopThreadsAreRunning();
-}
-#endif //#if USE_PARALLEL_ISLAND_SOLVER
-
-
-void profileBeginCallback(btDynamicsWorld *world, btScalar timeStep)
-{
-    gProfiler.begin(Profiler::kRecordInternalTimeStep);
+    btSimulationIslandManagerMt::parallelIslandDispatch( islandsPtr, callback );
 }
 
-void profileEndCallback(btDynamicsWorld *world, btScalar timeStep)
-{
-    gProfiler.end(Profiler::kRecordInternalTimeStep);
-}
 
 ///
-/// MyDiscreteDynamicsWorld
-///
-///  Should function exactly like btDiscreteDynamicsWorld.
-///  3 methods that iterate over all of the rigidbodies can run in parallel:
-///     - predictUnconstraintMotion
-///     - integrateTransforms
-///     - createPredictiveContacts
+/// MyDiscreteDynamicsWorld -- subclassed for profiling purposes
 ///
 ATTRIBUTE_ALIGNED16( class ) MyDiscreteDynamicsWorld : public btDiscreteDynamicsWorldMt
 {
     typedef btDiscreteDynamicsWorld ParentClass;
 
 protected:
-#if USE_PARALLEL_PREDICT_UNCONSTRAINED_MOTION
-    struct UpdaterUnconstrainedMotion
-    {
-        btScalar timeStep;
-        btRigidBody** rigidBodies;
-
-        void forLoop( int iBegin, int iEnd ) const
-        {
-            for ( int i = iBegin; i < iEnd; ++i )
-            {
-                btRigidBody* body = rigidBodies[ i ];
-                if ( !body->isStaticOrKinematicObject() )
-                {
-                    //don't integrate/update velocities here, it happens in the constraint solver
-                    body->applyDamping( timeStep );
-                    body->predictIntegratedTransform( timeStep, body->getInterpolationWorldTransform() );
-                }
-            }
-        }
-    };
 
     virtual void predictUnconstraintMotion( btScalar timeStep ) BT_OVERRIDE
     {
         ProfileHelper prof( Profiler::kRecordPredictUnconstrainedMotion );
-        BT_PROFILE( "predictUnconstraintMotion" );
-        int grainSize = 50;  // num of iterations per task for TBB
-        int bodyCount = m_nonStaticRigidBodies.size();
-        UpdaterUnconstrainedMotion update;
-        update.timeStep = timeStep;
-        update.rigidBodies = bodyCount ? &m_nonStaticRigidBodies[ 0 ] : NULL;
-        btPushThreadsAreRunning();
-        parallelFor( 0, bodyCount, grainSize, update );
-        btPopThreadsAreRunning();
+        ParentClass::predictUnconstraintMotion( timeStep );
     }
-#endif // #if USE_PARALLEL_PREDICT_UNCONSTRAINED_MOTION
-
-#if USE_PARALLEL_CREATE_PREDICTIVE_CONTACTS
-    struct UpdaterCreatePredictiveContacts
-    {
-        btScalar timeStep;
-        btRigidBody** rigidBodies;
-        MyDiscreteDynamicsWorld* world;
-
-        void forLoop( int iBegin, int iEnd ) const
-        {
-            world->createPredictiveContactsInternal( &rigidBodies[ iBegin ], iEnd - iBegin, timeStep );
-        }
-    };
-
-    virtual void createPredictiveContacts( btScalar timeStep )
+    virtual void createPredictiveContacts( btScalar timeStep ) BT_OVERRIDE
     {
         ProfileHelper prof( Profiler::kRecordCreatePredictiveContacts );
-        releasePredictiveContacts();
-        int grainSize = 50;  // num of iterations per task for TBB or OPENMP
-        if ( int bodyCount = m_nonStaticRigidBodies.size() )
-        {
-            UpdaterCreatePredictiveContacts update;
-            update.world = this;
-            update.timeStep = timeStep;
-            update.rigidBodies = &m_nonStaticRigidBodies[ 0 ];
-            btPushThreadsAreRunning();
-            parallelFor( 0, bodyCount, grainSize, update );
-            btPopThreadsAreRunning();
-        }
+        ParentClass::createPredictiveContacts( timeStep );
     }
-#endif // #if USE_PARALLEL_CREATE_PREDICTIVE_CONTACTS
-
-#if USE_PARALLEL_INTEGRATE_TRANSFORMS
-    struct UpdaterIntegrateTransforms
-    {
-        btScalar timeStep;
-        btRigidBody** rigidBodies;
-        MyDiscreteDynamicsWorld* world;
-
-        void forLoop( int iBegin, int iEnd ) const
-        {
-            world->integrateTransformsInternal( &rigidBodies[ iBegin ], iEnd - iBegin, timeStep );
-        }
-    };
-
     virtual void integrateTransforms( btScalar timeStep ) BT_OVERRIDE
     {
         ProfileHelper prof( Profiler::kRecordIntegrateTransforms );
-        BT_PROFILE( "integrateTransforms" );
-        int grainSize = 50;  // num of iterations per task for TBB or OPENMP
-        if ( int bodyCount = m_nonStaticRigidBodies.size() )
-        {
-            UpdaterIntegrateTransforms update;
-            update.world = this;
-            update.timeStep = timeStep;
-            update.rigidBodies = &m_nonStaticRigidBodies[ 0 ];
-            btPushThreadsAreRunning();
-            parallelFor( 0, bodyCount, grainSize, update );
-            btPopThreadsAreRunning();
-        }
+        ParentClass::integrateTransforms( timeStep );
     }
-#endif // #if USE_PARALLEL_INTEGRATE_TRANSFORMS
 
 public:
     BT_DECLARE_ALIGNED_ALLOCATOR();
 
     MyDiscreteDynamicsWorld( btDispatcher* dispatcher,
                              btBroadphaseInterface* pairCache,
-                             btConstraintSolver* constraintSolver,
+                             btConstraintSolverPoolMt* constraintSolver,
                              btCollisionConfiguration* collisionConfiguration
                              ) :
                              btDiscreteDynamicsWorldMt( dispatcher, pairCache, constraintSolver, collisionConfiguration )
     {
-#if USE_PARALLEL_ISLAND_SOLVER
         btSimulationIslandManagerMt* islandMgr = static_cast<btSimulationIslandManagerMt*>( m_islandManager );
-        islandMgr->setIslandDispatchFunction( parallelIslandDispatch );
-#endif //#if USE_PARALLEL_ISLAND_SOLVER
+        islandMgr->setIslandDispatchFunction( myParallelIslandDispatch );
     }
 
 };
@@ -625,6 +238,47 @@ btConstraintSolver* createSolverByType( SolverType t )
 }
 
 
+///
+/// btTaskSchedulerManager -- manage a number of task schedulers so we can switch between them
+///
+class btTaskSchedulerManager
+{
+    btAlignedObjectArray<btITaskScheduler*> m_taskSchedulers;
+
+public:
+    btTaskSchedulerManager() {}
+    void init()
+    {
+        addTaskScheduler( btGetSequentialTaskScheduler() );
+        addTaskScheduler( btGetOpenMPTaskScheduler() );
+        addTaskScheduler( btGetTBBTaskScheduler() );
+        addTaskScheduler( btGetPPLTaskScheduler() );
+        if ( getNumTaskSchedulers() > 1 )
+        {
+            // prefer a non-sequential scheduler if available
+            btSetTaskScheduler( m_taskSchedulers[ 1 ] );
+        }
+        else
+        {
+            btSetTaskScheduler( m_taskSchedulers[ 0 ] );
+        }
+        btGetTaskScheduler()->setNumThreads( btGetTaskScheduler()->getMaxNumThreads() );
+    }
+
+    void addTaskScheduler( btITaskScheduler* ts )
+    {
+        if ( ts )
+        {
+            m_taskSchedulers.push_back( ts );
+        }
+    }
+    int getNumTaskSchedulers() const { return m_taskSchedulers.size(); }
+    btITaskScheduler* getTaskScheduler( int i ) { return m_taskSchedulers[ i ]; }
+};
+
+
+static btTaskSchedulerManager gTaskSchedulerMgr;
+
 static bool gMultithreadedWorld = false;
 static bool gDisplayProfileInfo = false;
 static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
@@ -652,15 +306,17 @@ CommonRigidBodyMTBase::CommonRigidBodyMTBase( struct GUIHelperInterface* helper
 {
     m_multithreadedWorld = false;
     m_multithreadCapable = false;
-    gTaskMgr.init();
+    if ( gTaskSchedulerMgr.getNumTaskSchedulers() == 0 )
+    {
+        gTaskSchedulerMgr.init();
+    }
 }
 
 CommonRigidBodyMTBase::~CommonRigidBodyMTBase()
 {
-    gTaskMgr.shutdown();
 }
 
-void boolPtrButtonCallback(int buttonId, bool buttonState, void* userPointer)
+static void boolPtrButtonCallback(int buttonId, bool buttonState, void* userPointer)
 {
     if (bool* val = static_cast<bool*>(userPointer))
     {
@@ -668,7 +324,7 @@ void boolPtrButtonCallback(int buttonId, bool buttonState, void* userPointer)
     }
 }
 
-void toggleSolverModeCallback(int buttonId, bool buttonState, void* userPointer)
+static void toggleSolverModeCallback(int buttonId, bool buttonState, void* userPointer)
 {
     if (buttonState)
     {
@@ -687,7 +343,7 @@ void toggleSolverModeCallback(int buttonId, bool buttonState, void* userPointer)
     }
 }
 
-void setSolverTypeCallback(int buttonId, bool buttonState, void* userPointer)
+static void setSolverTypeCallback(int buttonId, bool buttonState, void* userPointer)
 {
     if (buttonId >= 0 && buttonId < SOLVER_TYPE_COUNT)
     {
@@ -695,32 +351,30 @@ void setSolverTypeCallback(int buttonId, bool buttonState, void* userPointer)
     }
 }
 
-void apiSelectButtonCallback(int buttonId, bool buttonState, void* userPointer)
+static void setNumThreads( int numThreads )
 {
-    gTaskMgr.setApi(static_cast<TaskManager::Api>(buttonId));
-    if (gTaskMgr.getApi()==TaskManager::apiNone)
+    int newNumThreads = ( std::min )( numThreads, int( BT_MAX_THREAD_COUNT ) );
+    int oldNumThreads = btGetTaskScheduler()->getNumThreads();
+    // only call when the thread count is different
+    if ( newNumThreads != oldNumThreads )
     {
-        gSliderNumThreads = 1.0f;
-    }
-    else
-    {
-        gSliderNumThreads = float(gTaskMgr.getNumThreads());
+        btGetTaskScheduler()->setNumThreads( newNumThreads );
     }
 }
 
-void setThreadCountCallback(float val, void* userPtr)
+static void apiSelectButtonCallback(int buttonId, bool buttonState, void* userPointer)
 {
-    if (gTaskMgr.getApi()==TaskManager::apiNone)
-    {
-        gSliderNumThreads = 1.0f;
-    }
-    else
-    {
-        gTaskMgr.setNumThreads( int( gSliderNumThreads ) );
-    }
+    // change the task scheduler
+    btSetTaskScheduler( gTaskSchedulerMgr.getTaskScheduler( buttonId ) );
+    setNumThreads( int( gSliderNumThreads ) );
 }
 
-void setSolverIterationCountCallback(float val, void* userPtr)
+static void setThreadCountCallback(float val, void* userPtr)
+{
+    setNumThreads( int( gSliderNumThreads ) );
+}
+
+static void setSolverIterationCountCallback(float val, void* userPtr)
 {
     if (btDiscreteDynamicsWorld* world = reinterpret_cast<btDiscreteDynamicsWorld*>(userPtr))
     {
@@ -733,6 +387,7 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
     gNumIslands = 0;
     m_solverType = gSolverType;
 #if BT_THREADSAFE && (BT_USE_OPENMP || BT_USE_PPL || BT_USE_TBB)
+    btAssert( btGetTaskScheduler() != NULL );
     m_multithreadCapable = true;
 #endif
     if ( gMultithreadedWorld )
@@ -743,30 +398,24 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
         cci.m_defaultMaxCollisionAlgorithmPoolSize = 80000;
         m_collisionConfiguration = new btDefaultCollisionConfiguration( cci );
 
-#if USE_PARALLEL_NARROWPHASE
-        m_dispatcher = new	MyCollisionDispatcher( m_collisionConfiguration );
-#else
-        m_dispatcher = new	btCollisionDispatcher( m_collisionConfiguration );
-#endif //USE_PARALLEL_NARROWPHASE
-
+        m_dispatcher = new MyCollisionDispatcher( m_collisionConfiguration, 40 );
         m_broadphase = new btDbvtBroadphase();
 
-#if BT_THREADSAFE && USE_PARALLEL_ISLAND_SOLVER
+        btConstraintSolverPoolMt* solverPool;
         {
             btConstraintSolver* solvers[ BT_MAX_THREAD_COUNT ];
-            int maxThreadCount = btMin( int(BT_MAX_THREAD_COUNT), TaskManager::getMaxNumThreads() );
+            int maxThreadCount = BT_MAX_THREAD_COUNT;
             for ( int i = 0; i < maxThreadCount; ++i )
             {
                 solvers[ i ] = createSolverByType( m_solverType );
             }
-            m_solver = new MyConstraintSolverPool( solvers, maxThreadCount );
+            solverPool = new btConstraintSolverPoolMt( solvers, maxThreadCount );
+            m_solver = solverPool;
         }
-#else
-        m_solver = createSolverByType( m_solverType );
-#endif //#if USE_PARALLEL_ISLAND_SOLVER
-        btDiscreteDynamicsWorld* world = new MyDiscreteDynamicsWorld( m_dispatcher, m_broadphase, m_solver, m_collisionConfiguration );
+        btDiscreteDynamicsWorld* world = new MyDiscreteDynamicsWorld( m_dispatcher, m_broadphase, solverPool, m_collisionConfiguration );
         m_dynamicsWorld = world;
         m_multithreadedWorld = true;
+        btAssert( btGetTaskScheduler() != NULL );
     }
     else
     {
@@ -886,24 +535,25 @@ void CommonRigidBodyMTBase::createDefaultParameters()
     if (m_multithreadedWorld)
     {
         // create a button for each supported threading API
-        for (int iApi = 0; iApi < TaskManager::apiCount; ++iApi)
+        for ( int iApi = 0; iApi < gTaskSchedulerMgr.getNumTaskSchedulers(); ++iApi )
         {
-            TaskManager::Api api = static_cast<TaskManager::Api>(iApi);
-            if (gTaskMgr.isSupported(api))
-            {
-                char str[1024];
-                sprintf(str, "API %s", gTaskMgr.getApiName(api));
-                ButtonParams button( str, iApi, false );
-                button.m_callback = apiSelectButtonCallback;
-                m_guiHelper->getParameterInterface()->registerButtonParameter( button );
-            }
+            char str[ 1024 ];
+            sprintf( str, "API %s", gTaskSchedulerMgr.getTaskScheduler(iApi)->getName() );
+            ButtonParams button( str, iApi, false );
+            button.m_callback = apiSelectButtonCallback;
+            m_guiHelper->getParameterInterface()->registerButtonParameter( button );
         }
         {
             // create a slider to set the number of threads to use
-            gSliderNumThreads = float(gTaskMgr.getNumThreads());
+            int numThreads = btGetTaskScheduler()->getNumThreads();
+            // if slider has not been set yet (by another demo),
+            if ( gSliderNumThreads <= 1.0f )
+            {
+                gSliderNumThreads = float( numThreads );
+            }
 			SliderParams slider("Thread count", &gSliderNumThreads);
 			slider.m_minVal = 1.0f;
-			slider.m_maxVal = float(gTaskMgr.getMaxNumThreads()*2);
+			slider.m_maxVal = float( BT_MAX_THREAD_COUNT );
 			slider.m_callback = setThreadCountCallback;
             slider.m_clampToIntegers = true;
             m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
@@ -946,14 +596,14 @@ void CommonRigidBodyMTBase::drawScreenText()
                 const btPersistentManifold* man = m_dispatcher->getManifoldByIndexInternal( i );
                 numContacts += man->getNumContacts();
             }
-            const char* mtApi = TaskManager::getApiName( gTaskMgr.getApi() );
+            const char* mtApi = btGetTaskScheduler()->getName();
             sprintf( msg, "islands=%d bodies=%d manifolds=%d contacts=%d [%s] threads=%d",
                      gNumIslands,
                      m_dynamicsWorld->getNumCollisionObjects(),
                      numManifolds,
                      numContacts,
                      mtApi,
-                     gTaskMgr.getApi() == TaskManager::apiNone ? 1 : gTaskMgr.getNumThreads()
+                     btGetTaskScheduler()->getNumThreads()
                      );
             m_guiHelper->getAppInterface()->drawText( msg, 100, yCoord, 0.4f );
             yCoord += yStep;
diff --git a/examples/MultiThreadedDemo/ParallelFor.h b/examples/MultiThreadedDemo/ParallelFor.h
deleted file mode 100644
index cb1325025..000000000
--- a/examples/MultiThreadedDemo/ParallelFor.h
+++ /dev/null
@@ -1,336 +0,0 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-#include <stdio.h> //printf debugging
-#include <algorithm>
-
-
-// choose threading providers:
-#if BT_USE_TBB
-#define USE_TBB 1     // use Intel Threading Building Blocks for thread management
-#endif
-
-#if BT_USE_PPL
-#define USE_PPL 1     // use Microsoft Parallel Patterns Library (installed with Visual Studio 2010 and later)
-#endif // BT_USE_PPL
-
-#if BT_USE_OPENMP
-#define USE_OPENMP 1  // use OpenMP (also need to change compiler options for OpenMP support)
-#endif
-
-
-#if USE_OPENMP
-
-#include <omp.h>
-
-#endif // #if USE_OPENMP
-
-
-#if USE_PPL
-
-#include <ppl.h>  // if you get a compile error here, check whether your version of Visual Studio includes PPL
-// Visual Studio 2010 and later should come with it
-#include <concrtrm.h>  // for GetProcessorCount()
-#endif // #if USE_PPL
-
-
-#if USE_TBB
-
-#define __TBB_NO_IMPLICIT_LINKAGE 1
-#include <tbb/tbb.h>
-#include <tbb/task_scheduler_init.h>
-#include <tbb/parallel_for.h>
-#include <tbb/blocked_range.h>
-
-#endif // #if USE_TBB
-
-
-
-class TaskManager
-{
-public:
-    enum Api
-    {
-        apiNone,
-        apiOpenMP,
-        apiTbb,
-        apiPpl,
-        apiCount
-    };
-    static const char* getApiName( Api api )
-    {
-        switch ( api )
-        {
-        case apiNone: return "None";
-        case apiOpenMP: return "OpenMP";
-        case apiTbb: return "Intel TBB";
-        case apiPpl: return "MS PPL";
-        default: return "unknown";
-        }
-    }
-
-    TaskManager()
-    {
-        m_api = apiNone;
-        m_numThreads = 0;
-#if USE_TBB
-        m_tbbSchedulerInit = NULL;
-#endif // #if USE_TBB
-    }
-
-    Api getApi() const
-    {
-        return m_api;
-    }
-
-    bool isSupported( Api api ) const
-    {
-#if USE_OPENMP
-        if ( api == apiOpenMP )
-        {
-            return true;
-        }
-#endif
-#if USE_TBB
-        if ( api == apiTbb )
-        {
-            return true;
-        }
-#endif
-#if USE_PPL
-        if ( api == apiPpl )
-        {
-            return true;
-        }
-#endif
-        // apiNone is always "supported"
-        return api == apiNone;
-    }
-
-    void setApi( Api api )
-    {
-        if (isSupported(api))
-        {
-            m_api = api;
-        }
-        else
-        {
-            // no compile time support for selected API, fallback to "none"
-            m_api = apiNone;
-        }
-    }
-
-    static int getMaxNumThreads()
-    {
-#if USE_OPENMP
-        return omp_get_max_threads();
-#elif USE_PPL
-        return concurrency::GetProcessorCount();
-#elif USE_TBB
-        return tbb::task_scheduler_init::default_num_threads();
-#endif
-        return 1;
-    }
-
-    int getNumThreads() const
-    {
-        return m_numThreads;
-    }
-
-    int setNumThreads( int numThreads )
-    {
-        m_numThreads = ( std::max )( 1, numThreads );
-
-#if USE_OPENMP
-        omp_set_num_threads( m_numThreads );
-#endif
-
-#if USE_PPL
-        {
-            using namespace concurrency;
-            if ( CurrentScheduler::Id() != -1 )
-            {
-                CurrentScheduler::Detach();
-            }
-            SchedulerPolicy policy;
-            policy.SetConcurrencyLimits( m_numThreads, m_numThreads );
-            CurrentScheduler::Create( policy );
-        }
-#endif
-
-#if USE_TBB
-        if ( m_tbbSchedulerInit )
-        {
-            delete m_tbbSchedulerInit;
-            m_tbbSchedulerInit = NULL;
-        }
-        m_tbbSchedulerInit = new tbb::task_scheduler_init( m_numThreads );
-#endif
-        return m_numThreads;
-    }
-
-    void init()
-    {
-        if (m_numThreads == 0)
-        {
-#if USE_PPL
-            setApi( apiPpl );
-#endif
-#if USE_TBB
-            setApi( apiTbb );
-#endif
-#if USE_OPENMP
-            setApi( apiOpenMP );
-#endif
-            setNumThreads(getMaxNumThreads());
-        }
-        else
-        {
-            setNumThreads(m_numThreads);
-        }
-    }
-
-    void shutdown()
-    {
-#if USE_TBB
-        if ( m_tbbSchedulerInit )
-        {
-            delete m_tbbSchedulerInit;
-            m_tbbSchedulerInit = NULL;
-        }
-#endif
-    }
-
-private:
-    Api m_api;
-    int m_numThreads;
-#if USE_TBB
-    tbb::task_scheduler_init* m_tbbSchedulerInit;
-#endif // #if USE_TBB
-};
-
-extern TaskManager gTaskMgr;
-
-
-inline static void initTaskScheduler()
-{
-    gTaskMgr.init();
-}
-
-inline static void cleanupTaskScheduler()
-{
-    gTaskMgr.shutdown();
-}
-
-
-#if USE_TBB
-///
-/// TbbBodyAdapter -- Converts a body object that implements the
-///                   "forLoop(int iBegin, int iEnd) const" function
-///  into a TBB compatible object that takes a tbb::blocked_range<int> type.
-///
-template <class TBody>
-struct TbbBodyAdapter
-{
-    const TBody* mBody;
-
-    void operator()( const tbb::blocked_range<int>& range ) const
-    {
-        mBody->forLoop( range.begin(), range.end() );
-    }
-};
-#endif // #if USE_TBB
-
-#if USE_PPL
-///
-/// PplBodyAdapter -- Converts a body object that implements the
-///                   "forLoop(int iBegin, int iEnd) const" function
-///  into a PPL compatible object that implements "void operator()( int ) const"
-///
-template <class TBody>
-struct PplBodyAdapter
-{
-    const TBody* mBody;
-    int mGrainSize;
-    int mIndexEnd;
-
-    void operator()( int i ) const
-    {
-        mBody->forLoop( i, (std::min)(i + mGrainSize, mIndexEnd) );
-    }
-};
-#endif // #if USE_PPL
-
-
-///
-/// parallelFor -- interface for submitting work expressed as a for loop to the worker threads
-///
-template <class TBody>
-void parallelFor( int iBegin, int iEnd, int grainSize, const TBody& body )
-{
-#if USE_OPENMP
-    if ( gTaskMgr.getApi() == TaskManager::apiOpenMP )
-    {
-#pragma omp parallel for schedule(static, 1)
-        for ( int i = iBegin; i < iEnd; i += grainSize )
-        {
-            body.forLoop( i, (std::min)( i + grainSize, iEnd ) );
-        }
-        return;
-    }
-#endif // #if USE_OPENMP
-
-#if USE_PPL
-    if ( gTaskMgr.getApi() == TaskManager::apiPpl )
-    {
-        // PPL dispatch
-        PplBodyAdapter<TBody> pplBody;
-        pplBody.mBody = &body;
-        pplBody.mGrainSize = grainSize;
-        pplBody.mIndexEnd = iEnd;
-        // note: MSVC 2010 doesn't support partitioner args, so avoid them
-        concurrency::parallel_for( iBegin,
-                                   iEnd,
-                                   grainSize,
-                                   pplBody
-                                   );
-        return;
-    }
-#endif //#if USE_PPL
-
-#if USE_TBB
-    if ( gTaskMgr.getApi() == TaskManager::apiTbb )
-    {
-        // TBB dispatch
-        TbbBodyAdapter<TBody> tbbBody;
-        tbbBody.mBody = &body;
-        tbb::parallel_for( tbb::blocked_range<int>( iBegin, iEnd, grainSize ),
-                           tbbBody,
-                           tbb::simple_partitioner()
-                           );
-        return;
-    }
-#endif // #if USE_TBB
-
-    {
-        // run on main thread
-        body.forLoop( iBegin, iEnd );
-    }
-
-}
-
-
-
-
diff --git a/src/BulletCollision/CMakeLists.txt b/src/BulletCollision/CMakeLists.txt
index 67afc7e0c..85c5fc8b6 100644
--- a/src/BulletCollision/CMakeLists.txt
+++ b/src/BulletCollision/CMakeLists.txt
@@ -15,6 +15,7 @@ SET(BulletCollision_SRCS
 	CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp
 	CollisionDispatch/btBoxBoxDetector.cpp
 	CollisionDispatch/btCollisionDispatcher.cpp
+	CollisionDispatch/btCollisionDispatcherMt.cpp
 	CollisionDispatch/btCollisionObject.cpp
 	CollisionDispatch/btCollisionWorld.cpp
 	CollisionDispatch/btCollisionWorldImporter.cpp
@@ -123,6 +124,7 @@ SET(CollisionDispatch_HDRS
 	CollisionDispatch/btCollisionConfiguration.h
 	CollisionDispatch/btCollisionCreateFunc.h
 	CollisionDispatch/btCollisionDispatcher.h
+	CollisionDispatch/btCollisionDispatcherMt.h
 	CollisionDispatch/btCollisionObject.h
 	CollisionDispatch/btCollisionObjectWrapper.h
 	CollisionDispatch/btCollisionWorld.h
diff --git a/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.cpp b/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.cpp
new file mode 100644
index 000000000..57cd050e5
--- /dev/null
+++ b/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.cpp
@@ -0,0 +1,164 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#include "btCollisionDispatcherMt.h"
+#include "LinearMath/btQuickprof.h"
+
+#include "BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h"
+
+#include "BulletCollision/CollisionShapes/btCollisionShape.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+#include "LinearMath/btPoolAllocator.h"
+#include "BulletCollision/CollisionDispatch/btCollisionConfiguration.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObjectWrapper.h"
+
+
+btCollisionDispatcherMt::btCollisionDispatcherMt( btCollisionConfiguration* config, int grainSize )
+    : btCollisionDispatcher( config )
+{
+    m_batchUpdating = false;
+    m_grainSize = grainSize;  // iterations per task
+}
+
+
+btPersistentManifold* btCollisionDispatcherMt::getNewManifold( const btCollisionObject* body0, const btCollisionObject* body1 )
+{
+    //optional relative contact breaking threshold, turned on by default (use setDispatcherFlags to switch off feature for improved performance)
+
+    btScalar contactBreakingThreshold = ( m_dispatcherFlags & btCollisionDispatcher::CD_USE_RELATIVE_CONTACT_BREAKING_THRESHOLD ) ?
+        btMin( body0->getCollisionShape()->getContactBreakingThreshold( gContactBreakingThreshold ), body1->getCollisionShape()->getContactBreakingThreshold( gContactBreakingThreshold ) )
+        : gContactBreakingThreshold;
+
+    btScalar contactProcessingThreshold = btMin( body0->getContactProcessingThreshold(), body1->getContactProcessingThreshold() );
+
+    void* mem = m_persistentManifoldPoolAllocator->allocate( sizeof( btPersistentManifold ) );
+    if ( NULL == mem )
+    {
+        //we got a pool memory overflow, by default we fallback to dynamically allocate memory. If we require a contiguous contact pool then assert.
+        if ( ( m_dispatcherFlags&CD_DISABLE_CONTACTPOOL_DYNAMIC_ALLOCATION ) == 0 )
+        {
+            mem = btAlignedAlloc( sizeof( btPersistentManifold ), 16 );
+        }
+        else
+        {
+            btAssert( 0 );
+            //make sure to increase the m_defaultMaxPersistentManifoldPoolSize in the btDefaultCollisionConstructionInfo/btDefaultCollisionConfiguration
+            return 0;
+        }
+    }
+    btPersistentManifold* manifold = new( mem ) btPersistentManifold( body0, body1, 0, contactBreakingThreshold, contactProcessingThreshold );
+    if ( !m_batchUpdating )
+    {
+        // batch updater will update manifold pointers array after finishing, so
+        // only need to update array when not batch-updating
+        btAssert( !btThreadsAreRunning() );
+        manifold->m_index1a = m_manifoldsPtr.size();
+        m_manifoldsPtr.push_back( manifold );
+    }
+
+    return manifold;
+}
+
+void btCollisionDispatcherMt::releaseManifold( btPersistentManifold* manifold )
+{
+    clearManifold( manifold );
+    btAssert( !btThreadsAreRunning() );
+    if ( !m_batchUpdating )
+    {
+        // batch updater will update manifold pointers array after finishing, so
+        // only need to update array when not batch-updating
+        int findIndex = manifold->m_index1a;
+        btAssert( findIndex < m_manifoldsPtr.size() );
+        m_manifoldsPtr.swap( findIndex, m_manifoldsPtr.size() - 1 );
+        m_manifoldsPtr[ findIndex ]->m_index1a = findIndex;
+        m_manifoldsPtr.pop_back();
+    }
+
+    manifold->~btPersistentManifold();
+    if ( m_persistentManifoldPoolAllocator->validPtr( manifold ) )
+    {
+        m_persistentManifoldPoolAllocator->freeMemory( manifold );
+    }
+    else
+    {
+        btAlignedFree( manifold );
+    }
+}
+
+struct CollisionDispatcherUpdater : public btIParallelForBody
+{
+    btBroadphasePair* mPairArray;
+    btNearCallback mCallback;
+    btCollisionDispatcher* mDispatcher;
+    const btDispatcherInfo* mInfo;
+
+    CollisionDispatcherUpdater()
+    {
+        mPairArray = NULL;
+        mCallback = NULL;
+        mDispatcher = NULL;
+        mInfo = NULL;
+    }
+    void forLoop( int iBegin, int iEnd ) const
+    {
+        for ( int i = iBegin; i < iEnd; ++i )
+        {
+            btBroadphasePair* pair = &mPairArray[ i ];
+            mCallback( *pair, *mDispatcher, *mInfo );
+        }
+    }
+};
+
+
+void btCollisionDispatcherMt::dispatchAllCollisionPairs( btOverlappingPairCache* pairCache, const btDispatcherInfo& info, btDispatcher* dispatcher )
+{
+    int pairCount = pairCache->getNumOverlappingPairs();
+    if ( pairCount == 0 )
+    {
+        return;
+    }
+    CollisionDispatcherUpdater updater;
+    updater.mCallback = getNearCallback();
+    updater.mPairArray = pairCache->getOverlappingPairArrayPtr();
+    updater.mDispatcher = this;
+    updater.mInfo = &info;
+
+    m_batchUpdating = true;
+    btParallelFor( 0, pairCount, m_grainSize, updater );
+    m_batchUpdating = false;
+
+    // reconstruct the manifolds array to ensure determinism
+    m_manifoldsPtr.resizeNoInitialize( 0 );
+
+    btBroadphasePair* pairs = pairCache->getOverlappingPairArrayPtr();
+    for ( int i = 0; i < pairCount; ++i )
+    {
+        if (btCollisionAlgorithm* algo = pairs[ i ].m_algorithm)
+        {
+            algo->getAllContactManifolds( m_manifoldsPtr );
+        }
+    }
+
+    // update the indices (used when releasing manifolds)
+    for ( int i = 0; i < m_manifoldsPtr.size(); ++i )
+    {
+        m_manifoldsPtr[ i ]->m_index1a = i;
+    }
+}
+
+
diff --git a/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h b/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h
new file mode 100644
index 000000000..f1d7eafdc
--- /dev/null
+++ b/src/BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h
@@ -0,0 +1,39 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_COLLISION_DISPATCHER_MT_H
+#define BT_COLLISION_DISPATCHER_MT_H
+
+#include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
+#include "LinearMath/btThreads.h"
+
+
+class btCollisionDispatcherMt : public btCollisionDispatcher
+{
+public:
+    btCollisionDispatcherMt( btCollisionConfiguration* config, int grainSize = 40 );
+
+    virtual btPersistentManifold* getNewManifold( const btCollisionObject* body0, const btCollisionObject* body1 ) BT_OVERRIDE;
+    virtual void releaseManifold( btPersistentManifold* manifold ) BT_OVERRIDE;
+
+    virtual void dispatchAllCollisionPairs( btOverlappingPairCache* pairCache, const btDispatcherInfo& info, btDispatcher* dispatcher ) BT_OVERRIDE;
+
+protected:
+    bool m_batchUpdating;
+    int m_grainSize;
+};
+
+#endif //BT_COLLISION_DISPATCHER_MT_H
+
diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
index 5e51a994c..faa8193dd 100644
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
@@ -108,8 +108,105 @@ struct InplaceSolverIslandCallbackMt : public btSimulationIslandManagerMt::Islan
 };
 
 
+///
+/// btConstraintSolverPoolMt
+///
 
-btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,btBroadphaseInterface* pairCache,btConstraintSolver* constraintSolver, btCollisionConfiguration* collisionConfiguration)
+btConstraintSolverPoolMt::ThreadSolver* btConstraintSolverPoolMt::getAndLockThreadSolver()
+{
+    int i = btGetCurrentThreadIndex() % m_solvers.size();
+    while ( true )
+    {
+        ThreadSolver& solver = m_solvers[ i ];
+        if ( solver.mutex.tryLock() )
+        {
+            return &solver;
+        }
+        // failed, try the next one
+        i = ( i + 1 ) % m_solvers.size();
+    }
+    return NULL;
+}
+
+void btConstraintSolverPoolMt::init( btConstraintSolver** solvers, int numSolvers )
+{
+    m_solverType = BT_SEQUENTIAL_IMPULSE_SOLVER;
+    m_solvers.resize( numSolvers );
+    for ( int i = 0; i < numSolvers; ++i )
+    {
+        m_solvers[ i ].solver = solvers[ i ];
+    }
+    if ( numSolvers > 0 )
+    {
+        m_solverType = solvers[ 0 ]->getSolverType();
+    }
+}
+
+// create the solvers for me
+btConstraintSolverPoolMt::btConstraintSolverPoolMt( int numSolvers )
+{
+    btAlignedObjectArray<btConstraintSolver*> solvers;
+    solvers.reserve( numSolvers );
+    for ( int i = 0; i < numSolvers; ++i )
+    {
+        btConstraintSolver* solver = new btSequentialImpulseConstraintSolver();
+        solvers.push_back( solver );
+    }
+    init( &solvers[ 0 ], numSolvers );
+}
+
+// pass in fully constructed solvers (destructor will delete them)
+btConstraintSolverPoolMt::btConstraintSolverPoolMt( btConstraintSolver** solvers, int numSolvers )
+{
+    init( solvers, numSolvers );
+}
+
+btConstraintSolverPoolMt::~btConstraintSolverPoolMt()
+{
+    // delete all solvers
+    for ( int i = 0; i < m_solvers.size(); ++i )
+    {
+        ThreadSolver& solver = m_solvers[ i ];
+        delete solver.solver;
+        solver.solver = NULL;
+    }
+}
+
+///solve a group of constraints
+btScalar btConstraintSolverPoolMt::solveGroup( btCollisionObject** bodies,
+    int numBodies,
+    btPersistentManifold** manifolds,
+    int numManifolds,
+    btTypedConstraint** constraints,
+    int numConstraints,
+    const btContactSolverInfo& info,
+    btIDebugDraw* debugDrawer,
+    btDispatcher* dispatcher
+)
+{
+    ThreadSolver* ts = getAndLockThreadSolver();
+    ts->solver->solveGroup( bodies, numBodies, manifolds, numManifolds, constraints, numConstraints, info, debugDrawer, dispatcher );
+    ts->mutex.unlock();
+    return 0.0f;
+}
+
+void btConstraintSolverPoolMt::reset()
+{
+    for ( int i = 0; i < m_solvers.size(); ++i )
+    {
+        ThreadSolver& solver = m_solvers[ i ];
+        solver.mutex.lock();
+        solver.solver->reset();
+        solver.mutex.unlock();
+    }
+}
+
+
+///
+/// btDiscreteDynamicsWorldMt
+///
+
+btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher, btBroadphaseInterface* pairCache, btConstraintSolverPoolMt* constraintSolver, btCollisionConfiguration* collisionConfiguration)
 : btDiscreteDynamicsWorld(dispatcher,pairCache,constraintSolver,collisionConfiguration)
 {
 	if (m_ownsIslandManager)
@@ -124,8 +221,8 @@ btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,bt
 	{
 		void* mem = btAlignedAlloc(sizeof(btSimulationIslandManagerMt),16);
 		btSimulationIslandManagerMt* im = new (mem) btSimulationIslandManagerMt();
-        m_islandManager = im;
         im->setMinimumSolverBatchSize( m_solverInfo.m_minimumSolverBatchSize );
+        m_islandManager = im;
 	}
 }
 
@@ -145,7 +242,7 @@ btDiscreteDynamicsWorldMt::~btDiscreteDynamicsWorldMt()
 }
 
 
-void	btDiscreteDynamicsWorldMt::solveConstraints(btContactSolverInfo& solverInfo)
+void btDiscreteDynamicsWorldMt::solveConstraints(btContactSolverInfo& solverInfo)
 {
 	BT_PROFILE("solveConstraints");
 
@@ -160,3 +257,65 @@ void	btDiscreteDynamicsWorldMt::solveConstraints(btContactSolverInfo& solverInfo
 }
 
 
+struct UpdaterUnconstrainedMotion : public btIParallelForBody
+{
+    btScalar timeStep;
+    btRigidBody** rigidBodies;
+
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        for ( int i = iBegin; i < iEnd; ++i )
+        {
+            btRigidBody* body = rigidBodies[ i ];
+            if ( !body->isStaticOrKinematicObject() )
+            {
+                //don't integrate/update velocities here, it happens in the constraint solver
+                body->applyDamping( timeStep );
+                body->predictIntegratedTransform( timeStep, body->getInterpolationWorldTransform() );
+            }
+        }
+    }
+};
+
+
+void btDiscreteDynamicsWorldMt::predictUnconstraintMotion( btScalar timeStep )
+{
+    BT_PROFILE( "predictUnconstraintMotion" );
+    int grainSize = 50;  // num of iterations per task for TBB
+    int bodyCount = m_nonStaticRigidBodies.size();
+    UpdaterUnconstrainedMotion update;
+    update.timeStep = timeStep;
+    update.rigidBodies = bodyCount ? &m_nonStaticRigidBodies[ 0 ] : NULL;
+    btParallelFor( 0, bodyCount, grainSize, update );
+}
+
+
+void btDiscreteDynamicsWorldMt::createPredictiveContacts( btScalar timeStep )
+{
+    releasePredictiveContacts();
+    int grainSize = 50;  // num of iterations per task for TBB or OPENMP
+    if ( int bodyCount = m_nonStaticRigidBodies.size() )
+    {
+        UpdaterCreatePredictiveContacts update;
+        update.world = this;
+        update.timeStep = timeStep;
+        update.rigidBodies = &m_nonStaticRigidBodies[ 0 ];
+        btParallelFor( 0, bodyCount, grainSize, update );
+    }
+}
+
+
+void btDiscreteDynamicsWorldMt::integrateTransforms( btScalar timeStep )
+{
+    BT_PROFILE( "integrateTransforms" );
+    int grainSize = 50;  // num of iterations per task for TBB or OPENMP
+    if ( int bodyCount = m_nonStaticRigidBodies.size() )
+    {
+        UpdaterIntegrateTransforms update;
+        update.world = this;
+        update.timeStep = timeStep;
+        update.rigidBodies = &m_nonStaticRigidBodies[ 0 ];
+        btParallelFor( 0, bodyCount, grainSize, update );
+    }
+}
+
diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
index b28371b51..2f144cdda 100644
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
@@ -18,24 +18,116 @@ subject to the following restrictions:
 #define BT_DISCRETE_DYNAMICS_WORLD_MT_H
 
 #include "btDiscreteDynamicsWorld.h"
+#include "btSimulationIslandManagerMt.h"
+#include "BulletDynamics/ConstraintSolver/btConstraintSolver.h"
 
 struct InplaceSolverIslandCallbackMt;
 
+///
+/// btConstraintSolverPoolMt - masquerades as a constraint solver, but really it is a threadsafe pool of them.
+///
+///  Each solver in the pool is protected by a mutex.  When solveGroup is called from a thread,
+///  the pool looks for a solver that isn't being used by another thread, locks it, and dispatches the
+///  call to the solver.
+///  So long as there are at least as many solvers as there are hardware threads, it should never need to
+///  spin wait.
+///
+class btConstraintSolverPoolMt : public btConstraintSolver
+{
+public:
+    // create the solvers for me
+    explicit btConstraintSolverPoolMt( int numSolvers );
+
+    // pass in fully constructed solvers (destructor will delete them)
+    btConstraintSolverPoolMt( btConstraintSolver** solvers, int numSolvers );
+
+    virtual ~btConstraintSolverPoolMt();
+
+    ///solve a group of constraints
+    virtual btScalar solveGroup( btCollisionObject** bodies,
+        int numBodies,
+        btPersistentManifold** manifolds,
+        int numManifolds,
+        btTypedConstraint** constraints,
+        int numConstraints,
+        const btContactSolverInfo& info,
+        btIDebugDraw* debugDrawer,
+        btDispatcher* dispatcher
+    ) BT_OVERRIDE;
+
+    virtual	void reset() BT_OVERRIDE;
+    virtual btConstraintSolverType getSolverType() const BT_OVERRIDE { return m_solverType; }
+
+private:
+    const static size_t kCacheLineSize = 128;
+    struct ThreadSolver
+    {
+        btConstraintSolver* solver;
+        btSpinMutex mutex;
+        char _cachelinePadding[ kCacheLineSize - sizeof( btSpinMutex ) - sizeof( void* ) ];  // keep mutexes from sharing a cache line
+    };
+    btAlignedObjectArray<ThreadSolver> m_solvers;
+    btConstraintSolverType m_solverType;
+
+    ThreadSolver* getAndLockThreadSolver();
+    void init( btConstraintSolver** solvers, int numSolvers );
+};
+
+
+
 ///
 /// btDiscreteDynamicsWorldMt -- a version of DiscreteDynamicsWorld with some minor changes to support
 ///                              solving simulation islands on multiple threads.
 ///
+///  Should function exactly like btDiscreteDynamicsWorld.
+///  Also 3 methods that iterate over all of the rigidbodies can run in parallel:
+///     - predictUnconstraintMotion
+///     - integrateTransforms
+///     - createPredictiveContacts
+///
 ATTRIBUTE_ALIGNED16(class) btDiscreteDynamicsWorldMt : public btDiscreteDynamicsWorld
 {
 protected:
     InplaceSolverIslandCallbackMt* m_solverIslandCallbackMt;
 
-    virtual void	solveConstraints(btContactSolverInfo& solverInfo);
+    virtual void solveConstraints(btContactSolverInfo& solverInfo) BT_OVERRIDE;
+
+    virtual void predictUnconstraintMotion( btScalar timeStep ) BT_OVERRIDE;
+
+    struct UpdaterCreatePredictiveContacts : public btIParallelForBody
+    {
+        btScalar timeStep;
+        btRigidBody** rigidBodies;
+        btDiscreteDynamicsWorldMt* world;
+
+        void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+        {
+            world->createPredictiveContactsInternal( &rigidBodies[ iBegin ], iEnd - iBegin, timeStep );
+        }
+    };
+    virtual void createPredictiveContacts( btScalar timeStep ) BT_OVERRIDE;
+
+    struct UpdaterIntegrateTransforms : public btIParallelForBody
+    {
+        btScalar timeStep;
+        btRigidBody** rigidBodies;
+        btDiscreteDynamicsWorldMt* world;
+
+        void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+        {
+            world->integrateTransformsInternal( &rigidBodies[ iBegin ], iEnd - iBegin, timeStep );
+        }
+    };
+    virtual void integrateTransforms( btScalar timeStep ) BT_OVERRIDE;
 
 public:
 	BT_DECLARE_ALIGNED_ALLOCATOR();
 
-	btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,btBroadphaseInterface* pairCache,btConstraintSolver* constraintSolver,btCollisionConfiguration* collisionConfiguration);
+	btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,
+        btBroadphaseInterface* pairCache,
+        btConstraintSolverPoolMt* constraintSolver,   // Note this should be a solver-pool for multi-threading
+        btCollisionConfiguration* collisionConfiguration
+    );
 	virtual ~btDiscreteDynamicsWorldMt();
 };
 
diff --git a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
index ad63b6ee0..6373f60ff 100644
--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
@@ -15,6 +15,7 @@ subject to the following restrictions:
 
 
 #include "LinearMath/btScalar.h"
+#include "LinearMath/btThreads.h"
 #include "btSimulationIslandManagerMt.h"
 #include "BulletCollision/BroadphaseCollision/btDispatcher.h"
 #include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
@@ -44,7 +45,7 @@ btSimulationIslandManagerMt::btSimulationIslandManagerMt()
 {
     m_minimumSolverBatchSize = calcBatchCost(0, 128, 0);
     m_batchIslandMinBodyCount = 32;
-    m_islandDispatch = defaultIslandDispatch;
+    m_islandDispatch = parallelIslandDispatch;
     m_batchIsland = NULL;
 }
 
@@ -545,7 +546,7 @@ void btSimulationIslandManagerMt::mergeIslands()
 }
 
 
-void btSimulationIslandManagerMt::defaultIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback )
+void btSimulationIslandManagerMt::serialIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback )
 {
     // serial dispatch
     btAlignedObjectArray<Island*>& islands = *islandsPtr;
@@ -565,6 +566,40 @@ void btSimulationIslandManagerMt::defaultIslandDispatch( btAlignedObjectArray<Is
     }
 }
 
+struct UpdateIslandDispatcher : public btIParallelForBody
+{
+    btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr;
+    btSimulationIslandManagerMt::IslandCallback* callback;
+
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        for ( int i = iBegin; i < iEnd; ++i )
+        {
+            btSimulationIslandManagerMt::Island* island = ( *islandsPtr )[ i ];
+            btPersistentManifold** manifolds = island->manifoldArray.size() ? &island->manifoldArray[ 0 ] : NULL;
+            btTypedConstraint** constraintsPtr = island->constraintArray.size() ? &island->constraintArray[ 0 ] : NULL;
+            callback->processIsland( &island->bodyArray[ 0 ],
+                island->bodyArray.size(),
+                manifolds,
+                island->manifoldArray.size(),
+                constraintsPtr,
+                island->constraintArray.size(),
+                island->id
+            );
+        }
+    }
+};
+
+void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback )
+{
+    int grainSize = 1;  // iterations per task
+    UpdateIslandDispatcher dispatcher;
+    dispatcher.islandsPtr = islandsPtr;
+    dispatcher.callback = callback;
+    btParallelFor( 0, islandsPtr->size(), grainSize, dispatcher );
+}
+
+
 ///@todo: this is random access, it can be walked 'cache friendly'!
 void btSimulationIslandManagerMt::buildAndProcessIslands( btDispatcher* dispatcher,
                                                         btCollisionWorld* collisionWorld,
diff --git a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
index 117061623..e3e0ea523 100644
--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
@@ -59,7 +59,8 @@ public:
                                     ) = 0;
     };
     typedef void( *IslandDispatchFunc ) ( btAlignedObjectArray<Island*>* islands, IslandCallback* callback );
-    static void defaultIslandDispatch( btAlignedObjectArray<Island*>* islands, IslandCallback* callback );
+    static void serialIslandDispatch( btAlignedObjectArray<Island*>* islands, IslandCallback* callback );
+    static void parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback );
 protected:
     btAlignedObjectArray<Island*> m_allocatedIslands;  // owner of all Islands
     btAlignedObjectArray<Island*> m_activeIslands;  // islands actively in use
diff --git a/src/LinearMath/btThreads.cpp b/src/LinearMath/btThreads.cpp
index b72301a2e..3c73255a1 100644
--- a/src/LinearMath/btThreads.cpp
+++ b/src/LinearMath/btThreads.cpp
@@ -14,6 +14,247 @@ subject to the following restrictions:
 
 
 #include "btThreads.h"
+#include <algorithm>  // for min and max
+
+#if BT_THREADSAFE
+
+#if BT_USE_OPENMP
+
+#include <omp.h>
+
+#endif // #if BT_USE_OPENMP
+
+
+#if BT_USE_PPL
+
+// use Microsoft Parallel Patterns Library (installed with Visual Studio 2010 and later)
+#include <ppl.h>  // if you get a compile error here, check whether your version of Visual Studio includes PPL
+// Visual Studio 2010 and later should come with it
+#include <concrtrm.h>  // for GetProcessorCount()
+
+#endif // #if BT_USE_PPL
+
+
+#if BT_USE_TBB
+
+// use Intel Threading Building Blocks for thread management
+#define __TBB_NO_IMPLICIT_LINKAGE 1
+#include <tbb/tbb.h>
+#include <tbb/task_scheduler_init.h>
+#include <tbb/parallel_for.h>
+#include <tbb/blocked_range.h>
+
+#endif // #if BT_USE_TBB
+
+
+static btITaskScheduler* gBtTaskScheduler;
+static int gThreadsRunningCounter = 0;  // useful for detecting if we are trying to do nested parallel-for calls
+static btSpinMutex gThreadsRunningCounterMutex;
+
+void btPushThreadsAreRunning()
+{
+    gThreadsRunningCounterMutex.lock();
+    gThreadsRunningCounter++;
+    gThreadsRunningCounterMutex.unlock();
+}
+
+void btPopThreadsAreRunning()
+{
+    gThreadsRunningCounterMutex.lock();
+    gThreadsRunningCounter--;
+    gThreadsRunningCounterMutex.unlock();
+}
+
+bool btThreadsAreRunning()
+{
+    return gThreadsRunningCounter != 0;
+}
+
+
+void btSetTaskScheduler( btITaskScheduler* ts )
+{
+    gBtTaskScheduler = ts;
+}
+
+btITaskScheduler* btGetTaskScheduler()
+{
+    return gBtTaskScheduler;
+}
+
+void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body )
+{
+    gBtTaskScheduler->parallelFor( iBegin, iEnd, grainSize, body );
+}
+
+
+#if BT_USE_OPENMP
+///
+/// btTaskSchedulerOpenMP -- OpenMP task scheduler implementation
+///
+class btTaskSchedulerOpenMP : public btITaskScheduler
+{
+    int m_numThreads;
+public:
+    btTaskSchedulerOpenMP() : btITaskScheduler( "OpenMP" )
+    {
+        m_numThreads = 0;
+    }
+    virtual int getMaxNumThreads() const BT_OVERRIDE
+    {
+        return omp_get_max_threads();
+    }
+    virtual int getNumThreads() const BT_OVERRIDE
+    {
+        return m_numThreads;
+    }
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
+    {
+        m_numThreads = ( std::max )( 1, numThreads );
+        omp_set_num_threads( m_numThreads );
+    }
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        btPushThreadsAreRunning();
+#pragma omp parallel for schedule( static, 1 )
+        for ( int i = iBegin; i < iEnd; i += grainSize )
+        {
+            body.forLoop( i, ( std::min )( i + grainSize, iEnd ) );
+        }
+        btPopThreadsAreRunning();
+    }
+};
+#endif // #if BT_USE_OPENMP
+
+
+#if BT_USE_TBB
+///
+/// btTaskSchedulerTBB -- task scheduler implemented via Intel Threaded Building Blocks
+///
+class btTaskSchedulerTBB : public btITaskScheduler
+{
+    int m_numThreads;
+    tbb::task_scheduler_init* m_tbbSchedulerInit;
+
+public:
+    btTaskSchedulerTBB() : btITaskScheduler( "IntelTBB" )
+    {
+        m_numThreads = 0;
+        m_tbbSchedulerInit = NULL;
+    }
+    ~btTaskSchedulerTBB()
+    {
+        if ( m_tbbSchedulerInit )
+        {
+            delete m_tbbSchedulerInit;
+            m_tbbSchedulerInit = NULL;
+        }
+    }
+
+    virtual int getMaxNumThreads() const BT_OVERRIDE
+    {
+        return tbb::task_scheduler_init::default_num_threads();
+    }
+    virtual int getNumThreads() const BT_OVERRIDE
+    {
+        return m_numThreads;
+    }
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
+    {
+        m_numThreads = ( std::max )( 1, numThreads );
+        if ( m_tbbSchedulerInit )
+        {
+            delete m_tbbSchedulerInit;
+            m_tbbSchedulerInit = NULL;
+        }
+        m_tbbSchedulerInit = new tbb::task_scheduler_init( m_numThreads );
+    }
+    struct BodyAdapter
+    {
+        const btIParallelForBody* mBody;
+
+        void operator()( const tbb::blocked_range<int>& range ) const
+        {
+            mBody->forLoop( range.begin(), range.end() );
+        }
+    };
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        // TBB dispatch
+        BodyAdapter tbbBody;
+        tbbBody.mBody = &body;
+        btPushThreadsAreRunning();
+        tbb::parallel_for( tbb::blocked_range<int>( iBegin, iEnd, grainSize ),
+            tbbBody,
+            tbb::simple_partitioner()
+        );
+        btPopThreadsAreRunning();
+    }
+};
+#endif // #if BT_USE_TBB
+
+#if BT_USE_PPL
+///
+/// btTaskSchedulerPPL -- task scheduler implemented via Microsoft Parallel Patterns Lib
+///
+class btTaskSchedulerPPL : public btITaskScheduler
+{
+    int m_numThreads;
+public:
+    btTaskSchedulerPPL() : btITaskScheduler( "PPL" )
+    {
+        m_numThreads = 0;
+    }
+    virtual int getMaxNumThreads() const BT_OVERRIDE
+    {
+        return concurrency::GetProcessorCount();
+    }
+    virtual int getNumThreads() const BT_OVERRIDE
+    {
+        return m_numThreads;
+    }
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
+    {
+        m_numThreads = ( std::max )( 1, numThreads );
+        using namespace concurrency;
+        if ( CurrentScheduler::Id() != -1 )
+        {
+            CurrentScheduler::Detach();
+        }
+        SchedulerPolicy policy;
+        policy.SetConcurrencyLimits( m_numThreads, m_numThreads );
+        CurrentScheduler::Create( policy );
+    }
+    struct BodyAdapter
+    {
+        const btIParallelForBody* mBody;
+        int mGrainSize;
+        int mIndexEnd;
+
+        void operator()( int i ) const
+        {
+            mBody->forLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );
+        }
+    };
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        // PPL dispatch
+        BodyAdapter pplBody;
+        pplBody.mBody = &body;
+        pplBody.mGrainSize = grainSize;
+        pplBody.mIndexEnd = iEnd;
+        btPushThreadsAreRunning();
+        // note: MSVC 2010 doesn't support partitioner args, so avoid them
+        concurrency::parallel_for( iBegin,
+            iEnd,
+            grainSize,
+            pplBody
+        );
+        btPopThreadsAreRunning();
+    }
+};
+#endif // #if BT_USE_PPL
+
+
 
 //
 // Lightweight spin-mutex based on atomics
@@ -22,8 +263,6 @@ subject to the following restrictions:
 // context switching.
 // 
 
-#if BT_THREADSAFE
-
 #if __cplusplus >= 201103L
 
 // for anything claiming full C++11 compliance, use C++11 atomics
@@ -229,3 +468,64 @@ bool btSpinMutex::tryLock()
 
 #endif // #if BT_THREADSAFE
 
+
+///
+/// btTaskSchedulerSequential -- non-threaded implementation of task scheduler
+///                              (fallback in case no multi-threaded schedulers are available)
+///
+class btTaskSchedulerSequential : public btITaskScheduler
+{
+public:
+    btTaskSchedulerSequential() : btITaskScheduler( "Sequential" ) {}
+    virtual int getMaxNumThreads() const BT_OVERRIDE { return 1; }
+    virtual int getNumThreads() const BT_OVERRIDE { return 1; }
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE {}
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        body.forLoop( iBegin, iEnd );
+    }
+};
+
+// create a non-threaded task scheduler (always available)
+btITaskScheduler* btGetSequentialTaskScheduler()
+{
+    static btTaskSchedulerSequential sTaskScheduler;
+    return &sTaskScheduler;
+}
+
+
+// create an OpenMP task scheduler (if available, otherwise returns null)
+btITaskScheduler* btGetOpenMPTaskScheduler()
+{
+#if BT_USE_OPENMP && BT_THREADSAFE
+    static btTaskSchedulerOpenMP sTaskScheduler;
+    return &sTaskScheduler;
+#else
+    return NULL;
+#endif
+}
+
+
+// create an Intel TBB task scheduler (if available, otherwise returns null)
+btITaskScheduler* btGetTBBTaskScheduler()
+{
+#if BT_USE_TBB && BT_THREADSAFE
+    static btTaskSchedulerTBB sTaskScheduler;
+    return &sTaskScheduler;
+#else
+    return NULL;
+#endif
+}
+
+
+// create a PPL task scheduler (if available, otherwise returns null)
+btITaskScheduler* btGetPPLTaskScheduler()
+{
+#if BT_USE_PPL && BT_THREADSAFE
+    static btTaskSchedulerPPL sTaskScheduler;
+    return &sTaskScheduler;
+#else
+    return NULL;
+#endif
+}
+
diff --git a/src/LinearMath/btThreads.h b/src/LinearMath/btThreads.h
index a15e5250b..b592ec3e5 100644
--- a/src/LinearMath/btThreads.h
+++ b/src/LinearMath/btThreads.h
@@ -19,6 +19,15 @@ subject to the following restrictions:
 
 #include "btScalar.h" // has definitions like SIMD_FORCE_INLINE
 
+#if defined (_MSC_VER) && _MSC_VER >= 1600
+// give us a compile error if any signatures of overriden methods is changed
+#define BT_OVERRIDE override
+#endif
+
+#ifndef BT_OVERRIDE
+#define BT_OVERRIDE
+#endif
+
 ///
 /// btSpinMutex -- lightweight spin-mutex implemented with atomic ops, never puts
 ///               a thread to sleep because it is designed to be used with a task scheduler
@@ -59,6 +68,7 @@ SIMD_FORCE_INLINE bool btMutexTryLock( btSpinMutex* mutex )
 
 // for internal use only
 bool btIsMainThread();
+bool btThreadsAreRunning();
 unsigned int btGetCurrentThreadIndex();
 const unsigned int BT_MAX_THREAD_COUNT = 64;
 
@@ -71,5 +81,55 @@ SIMD_FORCE_INLINE void btMutexUnlock( btSpinMutex* ) {}
 SIMD_FORCE_INLINE bool btMutexTryLock( btSpinMutex* ) {return true;}
 #endif
 
+//
+// btIParallelForBody -- subclass this to express work that can be done in parallel
+//
+class btIParallelForBody
+{
+public:
+    virtual void forLoop( int iBegin, int iEnd ) const = 0;
+};
+
+//
+// btITaskScheduler -- subclass this to implement a task scheduler that can dispatch work to
+//                     worker threads
+//
+class btITaskScheduler
+{
+    const char* m_name;
+public:
+    btITaskScheduler( const char* name ) : m_name( name ) {}
+    const char* getName() const { return m_name; }
+
+    virtual ~btITaskScheduler() {}
+    virtual int getMaxNumThreads() const = 0;
+    virtual int getNumThreads() const = 0;
+    virtual void setNumThreads( int numThreads ) = 0;
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) = 0;
+};
+
+// set the task scheduler to use for all calls to btParallelFor()
+// NOTE: you must set this prior to using any of the multi-threaded "Mt" classes
+void btSetTaskScheduler( btITaskScheduler* ts );
+
+// get the current task scheduler
+btITaskScheduler* btGetTaskScheduler();
+
+// get non-threaded task scheduler (always available)
+btITaskScheduler* btGetSequentialTaskScheduler();
+
+// get OpenMP task scheduler (if available, otherwise returns null)
+btITaskScheduler* btGetOpenMPTaskScheduler();
+
+// get Intel TBB task scheduler (if available, otherwise returns null)
+btITaskScheduler* btGetTBBTaskScheduler();
+
+// get PPL task scheduler (if available, otherwise returns null)
+btITaskScheduler* btGetPPLTaskScheduler();
+
+// btParallelFor -- call this to dispatch work like a for-loop
+//                 (iterations may be done out of order, so no dependencies are allowed)
+void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body );
+
 
 #endif //BT_THREADS_H