Merge pull request #1579 from lunkhound/parallel-solver-wip3

Multithreaded constraint solver
2018-03-26 17:11:05 -07:00
parent 79e7469a0a bdc3c2bafb
commit fafa939d33
28 changed files with 5520 additions and 908 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,14 +28,14 @@ OPTION(USE_GRAPHICAL_BENCHMARK "Use Graphical Benchmark" ON)
 OPTION(BUILD_SHARED_LIBS "Use shared libraries" OFF)
 OPTION(USE_SOFT_BODY_MULTI_BODY_DYNAMICS_WORLD "Use btSoftMultiBodyDynamicsWorld" ON)

-OPTION(BULLET2_USE_THREAD_LOCKS "Build Bullet 2 libraries with mutex locking around certain operations (required for multi-threading)" OFF)
-IF (BULLET2_USE_THREAD_LOCKS)
+OPTION(BULLET2_MULTITHREADING "Build Bullet 2 libraries with mutex locking around certain operations (required for multi-threading)" OFF)
+IF (BULLET2_MULTITHREADING)
    OPTION(BULLET2_USE_OPEN_MP_MULTITHREADING "Build Bullet 2 with support for multi-threading with OpenMP (requires a compiler with OpenMP support)" OFF)
    OPTION(BULLET2_USE_TBB_MULTITHREADING "Build Bullet 2 with support for multi-threading with Intel Threading Building Blocks (requires the TBB library to be already installed)" OFF)
    IF (MSVC)
        OPTION(BULLET2_USE_PPL_MULTITHREADING "Build Bullet 2 with support for multi-threading with Microsoft Parallel Patterns Library (requires MSVC compiler)" OFF)
    ENDIF (MSVC)
-ENDIF (BULLET2_USE_THREAD_LOCKS)
+ENDIF (BULLET2_MULTITHREADING)


 IF(NOT WIN32)
@@ -225,12 +225,15 @@ IF(USE_GRAPHICAL_BENCHMARK)
 ADD_DEFINITIONS( -DUSE_GRAPHICAL_BENCHMARK)
 ENDIF (USE_GRAPHICAL_BENCHMARK)

-IF(BULLET2_USE_THREAD_LOCKS)
+IF(BULLET2_MULTITHREADING)
 	ADD_DEFINITIONS( -DBT_THREADSAFE=1 )
 	IF (NOT MSVC)
 		SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 	ENDIF (NOT MSVC)
-ENDIF (BULLET2_USE_THREAD_LOCKS)
+	IF (NOT WIN32)
+		LINK_LIBRARIES( pthread )
+	ENDIF (NOT WIN32)
+ENDIF (BULLET2_MULTITHREADING)

 IF (BULLET2_USE_OPEN_MP_MULTITHREADING)
    ADD_DEFINITIONS("-DBT_USE_OPENMP=1")
--- a/build3/premake4.lua
+++ b/build3/premake4.lua
@@ -182,6 +182,14 @@ end
 		trigger = "audio",
 		description = "Enable audio"
 	}
+	newoption
+	{
+		trigger = "enable_multithreading",
+		description = "enable CPU multithreading for bullet2 libs"
+	}
+	if _OPTIONS["enable_multithreading"] then
+		defines {"BT_THREADSAFE=1"}
+	end
 	if _OPTIONS["double"] then
 		defines {"BT_USE_DOUBLE_PRECISION"}
 	end
--- a/examples/ExampleBrowser/CMakeLists.txt
+++ b/examples/ExampleBrowser/CMakeLists.txt
@@ -226,7 +226,6 @@ SET(BulletExampleBrowser_SRCS
 	../MultiThreading/b3PosixThreadSupport.cpp
 	../MultiThreading/b3Win32ThreadSupport.cpp
 	../MultiThreading/b3ThreadSupportInterface.cpp
-	../MultiThreading/btTaskScheduler.cpp
 	../RenderingExamples/TinyRendererSetup.cpp
 	../RenderingExamples/TimeSeriesCanvas.cpp
 	../RenderingExamples/TimeSeriesCanvas.h
--- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
+++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
@@ -29,17 +29,17 @@ class btCollisionShape;
 #include "BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h"
 #include "BulletDynamics/Dynamics/btSimulationIslandManagerMt.h"  // for setSplitIslands()
 #include "BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h"
+#include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h"
 #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"
 #include "BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.h"
 #include "BulletDynamics/MLCPSolvers/btMLCPSolver.h"
 #include "BulletDynamics/MLCPSolvers/btSolveProjectedGaussSeidel.h"
 #include "BulletDynamics/MLCPSolvers/btDantzigSolver.h"
 #include "BulletDynamics/MLCPSolvers/btLemkeSolver.h"
-#include "../MultiThreading/btTaskScheduler.h"


 static int gNumIslands = 0;
-
+bool gAllowNestedParallelForLoops = false;

 class Profiler
 {
@@ -52,6 +52,10 @@ public:
        kRecordPredictUnconstrainedMotion,
        kRecordCreatePredictiveContacts,
        kRecordIntegrateTransforms,
+        kRecordSolverTotal,
+        kRecordSolverSetup,
+        kRecordSolverIterations,
+        kRecordSolverFinish,
        kRecordCount
    };

@@ -139,6 +143,41 @@ static void profileEndCallback( btDynamicsWorld *world, btScalar timeStep )
 }


+class MySequentialImpulseConstraintSolverMt : public btSequentialImpulseConstraintSolverMt
+{
+    typedef btSequentialImpulseConstraintSolverMt ParentClass;
+public:
+    BT_DECLARE_ALIGNED_ALLOCATOR();
+	
+	MySequentialImpulseConstraintSolverMt() {}
+
+    // for profiling
+	virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE
+    {
+        ProfileHelper prof(Profiler::kRecordSolverSetup);
+        btScalar ret = ParentClass::solveGroupCacheFriendlySetup(bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer );
+        return ret;
+    }
+    virtual btScalar solveGroupCacheFriendlyIterations( btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer ) BT_OVERRIDE
+    {
+        ProfileHelper prof(Profiler::kRecordSolverIterations);
+        btScalar ret = ParentClass::solveGroupCacheFriendlyIterations(bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer );
+        return ret;
+    }
+    virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal) BT_OVERRIDE
+    {
+        ProfileHelper prof(Profiler::kRecordSolverFinish);
+        btScalar ret = ParentClass::solveGroupCacheFriendlyFinish(bodies, numBodies, infoGlobal);
+        return ret;
+    }
+    virtual btScalar solveGroup(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifold, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btDispatcher* dispatcher) BT_OVERRIDE
+    {
+        ProfileHelper prof(Profiler::kRecordSolverTotal);
+        btScalar ret = ParentClass::solveGroup(bodies, numBodies, manifold, numManifolds, constraints, numConstraints, info, debugDrawer, dispatcher);
+        return ret;
+    }
+};
+
 ///
 /// MyCollisionDispatcher -- subclassed for profiling purposes
 ///
@@ -161,11 +200,11 @@ public:
 ///
 /// myParallelIslandDispatch -- wrap default parallel dispatch for profiling and to get the number of simulation islands
 //
-void myParallelIslandDispatch( btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr, btSimulationIslandManagerMt::IslandCallback* callback )
+void myParallelIslandDispatch( btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr, const btSimulationIslandManagerMt::SolverParams& solverParams)
 {
    ProfileHelper prof( Profiler::kRecordDispatchIslands );
    gNumIslands = islandsPtr->size();
-    btSimulationIslandManagerMt::parallelIslandDispatch( islandsPtr, callback );
+    btSimulationIslandManagerMt::parallelIslandDispatch( islandsPtr, solverParams );
 }


@@ -200,9 +239,10 @@ public:
    MyDiscreteDynamicsWorld( btDispatcher* dispatcher,
                             btBroadphaseInterface* pairCache,
                             btConstraintSolverPoolMt* constraintSolver,
+                             btSequentialImpulseConstraintSolverMt* constraintSolverMt,
                             btCollisionConfiguration* collisionConfiguration
                             ) :
-                             btDiscreteDynamicsWorldMt( dispatcher, pairCache, constraintSolver, collisionConfiguration )
+                             btDiscreteDynamicsWorldMt( dispatcher, pairCache, constraintSolver, constraintSolverMt, collisionConfiguration )
    {
        btSimulationIslandManagerMt* islandMgr = static_cast<btSimulationIslandManagerMt*>( m_islandManager );
        islandMgr->setIslandDispatchFunction( myParallelIslandDispatch );
@@ -218,6 +258,8 @@ btConstraintSolver* createSolverByType( SolverType t )
    {
    case SOLVER_TYPE_SEQUENTIAL_IMPULSE:
        return new btSequentialImpulseConstraintSolver();
+    case SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT:
+        return new MySequentialImpulseConstraintSolverMt();
    case SOLVER_TYPE_NNCG:
        return new btNNCGConstraintSolver();
    case SOLVER_TYPE_MLCP_PGS:
@@ -253,7 +295,7 @@ public:
    {
        addTaskScheduler( btGetSequentialTaskScheduler() );
 #if BT_THREADSAFE
-        if ( btITaskScheduler* ts = createDefaultTaskScheduler() )
+        if ( btITaskScheduler* ts = btCreateDefaultTaskScheduler() )
        {
            m_allocatedTaskSchedulers.push_back( ts );
            addTaskScheduler( ts );
@@ -306,11 +348,12 @@ static btTaskSchedulerManager gTaskSchedulerMgr;
 #if BT_THREADSAFE
 static bool gMultithreadedWorld = true;
 static bool gDisplayProfileInfo = true;
+static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT;
 #else
 static bool gMultithreadedWorld = false;
 static bool gDisplayProfileInfo = false;
-#endif
 static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
+#endif
 static int gSolverMode = SOLVER_SIMD |
                        SOLVER_USE_WARMSTARTING |
                        // SOLVER_RANDMIZE_ORDER |
@@ -318,9 +361,11 @@ static int gSolverMode = SOLVER_SIMD |
                        // SOLVER_USE_2_FRICTION_DIRECTIONS |
                        0;
 static btScalar gSliderSolverIterations = 10.0f; // should be int
-
 static btScalar gSliderNumThreads = 1.0f;  // should be int
-
+static btScalar gSliderIslandBatchingThreshold = 0.0f; // should be int
+static btScalar gSliderMinBatchSize = btScalar(btSequentialImpulseConstraintSolverMt::s_minBatchSize); // should be int
+static btScalar gSliderMaxBatchSize = btScalar(btSequentialImpulseConstraintSolverMt::s_maxBatchSize); // should be int
+static btScalar gSliderLeastSquaresResidualThreshold = 0.0f;

 ////////////////////////////////////
 CommonRigidBodyMTBase::CommonRigidBodyMTBase( struct GUIHelperInterface* helper )
@@ -419,6 +464,23 @@ void setTaskSchedulerComboBoxCallback(int combobox, const char* item, void* user
 }


+void setBatchingMethodComboBoxCallback(int combobox, const char* item, void* userPointer)
+{
+#if BT_THREADSAFE
+    const char** items = static_cast<const char**>( userPointer );
+    for ( int i = 0; i < btBatchedConstraints::BATCHING_METHOD_COUNT; ++i )
+    {
+        if ( strcmp( item, items[ i ] ) == 0 )
+        {
+            // change the task scheduler
+            btSequentialImpulseConstraintSolverMt::s_contactBatchingMethod = static_cast<btBatchedConstraints::BatchingMethod>( i );
+            break;
+        }
+    }
+#endif // #if BT_THREADSAFE
+}
+
+
 static void setThreadCountCallback(float val, void* userPtr)
 {
 #if BT_THREADSAFE
@@ -435,13 +497,43 @@ static void setSolverIterationCountCallback(float val, void* userPtr)
    }
 }

+static void setLargeIslandManifoldCountCallback( float val, void* userPtr )
+{
+    btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching = int( gSliderIslandBatchingThreshold );
+}
+
+static void setMinBatchSizeCallback( float val, void* userPtr )
+{
+    gSliderMaxBatchSize = (std::max)(gSliderMinBatchSize, gSliderMaxBatchSize);
+    btSequentialImpulseConstraintSolverMt::s_minBatchSize = int(gSliderMinBatchSize);
+    btSequentialImpulseConstraintSolverMt::s_maxBatchSize = int(gSliderMaxBatchSize);
+}
+
+static void setMaxBatchSizeCallback( float val, void* userPtr )
+{
+    gSliderMinBatchSize = (std::min)(gSliderMinBatchSize, gSliderMaxBatchSize);
+    btSequentialImpulseConstraintSolverMt::s_minBatchSize = int(gSliderMinBatchSize);
+    btSequentialImpulseConstraintSolverMt::s_maxBatchSize = int(gSliderMaxBatchSize);
+}
+
+static void setLeastSquaresResidualThresholdCallback( float val, void* userPtr )
+{
+    if (btDiscreteDynamicsWorld* world = reinterpret_cast<btDiscreteDynamicsWorld*>(userPtr))
+    {
+        world->getSolverInfo().m_leastSquaresResidualThreshold = gSliderLeastSquaresResidualThreshold;
+    }
+}
+
 void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
 {
    gNumIslands = 0;
    m_solverType = gSolverType;
-#if BT_THREADSAFE && (BT_USE_OPENMP || BT_USE_PPL || BT_USE_TBB)
+#if BT_THREADSAFE
    btAssert( btGetTaskScheduler() != NULL );
+    if (NULL != btGetTaskScheduler() && gTaskSchedulerMgr.getNumTaskSchedulers() > 1)
+    {
        m_multithreadCapable = true;
+    }
 #endif
    if ( gMultithreadedWorld )
    {
@@ -457,16 +549,28 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()

        btConstraintSolverPoolMt* solverPool;
        {
+            SolverType poolSolverType = m_solverType;
+            if (poolSolverType == SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT)
+            {
+                // pool solvers shouldn't be parallel solvers, we don't allow that kind of
+                // nested parallelism because of performance issues
+                poolSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
+            }
            btConstraintSolver* solvers[ BT_MAX_THREAD_COUNT ];
            int maxThreadCount = BT_MAX_THREAD_COUNT;
            for ( int i = 0; i < maxThreadCount; ++i )
            {
-                solvers[ i ] = createSolverByType( m_solverType );
+                solvers[ i ] = createSolverByType( poolSolverType );
            }
            solverPool = new btConstraintSolverPoolMt( solvers, maxThreadCount );
            m_solver = solverPool;
        }
-        btDiscreteDynamicsWorld* world = new MyDiscreteDynamicsWorld( m_dispatcher, m_broadphase, solverPool, m_collisionConfiguration );
+        btSequentialImpulseConstraintSolverMt* solverMt = NULL;
+        if ( m_solverType == SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT )
+        {
+            solverMt = new MySequentialImpulseConstraintSolverMt();
+        }
+        btDiscreteDynamicsWorld* world = new MyDiscreteDynamicsWorld( m_dispatcher, m_broadphase, solverPool, solverMt, m_collisionConfiguration );
        m_dynamicsWorld = world;
        m_multithreadedWorld = true;
        btAssert( btGetTaskScheduler() != NULL );
@@ -486,7 +590,14 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()

        m_broadphase = new btDbvtBroadphase();

-        m_solver = createSolverByType( m_solverType );
+        SolverType solverType = m_solverType;
+        if ( solverType == SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT )
+        {
+            // using the parallel solver with the single-threaded world works, but is
+            // disabled here to avoid confusion
+            solverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
+        }
+        m_solver = createSolverByType( solverType );

        m_dynamicsWorld = new btDiscreteDynamicsWorld( m_dispatcher, m_broadphase, m_solver, m_collisionConfiguration );
    }
@@ -494,6 +605,7 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
    m_dynamicsWorld->setInternalTickCallback( profileEndCallback, NULL, false );
    m_dynamicsWorld->setGravity( btVector3( 0, -10, 0 ) );
    m_dynamicsWorld->getSolverInfo().m_solverMode = gSolverMode;
+   	m_dynamicsWorld->getSolverInfo().m_numIterations = btMax(1, int(gSliderSolverIterations));
    createDefaultParameters();
 }

@@ -504,16 +616,18 @@ void CommonRigidBodyMTBase::createDefaultParameters()
    {
        // create a button to toggle multithreaded world
        ButtonParams button( "Multithreaded world enable", 0, true );
-        button.m_initialState = gMultithreadedWorld;
-        button.m_userPointer = &gMultithreadedWorld;
+        bool* ptr = &gMultithreadedWorld;
+        button.m_initialState = *ptr;
+        button.m_userPointer = ptr;
        button.m_callback = boolPtrButtonCallback;
        m_guiHelper->getParameterInterface()->registerButtonParameter( button );
    }
    {
        // create a button to toggle profile printing
        ButtonParams button( "Display solver info", 0, true );
-        button.m_initialState = gDisplayProfileInfo;
-        button.m_userPointer = &gDisplayProfileInfo;
+        bool* ptr = &gDisplayProfileInfo;
+        button.m_initialState = *ptr;
+        button.m_userPointer = ptr;
        button.m_callback = boolPtrButtonCallback;
        m_guiHelper->getParameterInterface()->registerButtonParameter( button );
    }
@@ -544,6 +658,16 @@ void CommonRigidBodyMTBase::createDefaultParameters()
        slider.m_clampToIntegers = true;
        m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
    }
+    {
+        // a slider for the solver leastSquaresResidualThreshold (used to run fewer solver iterations when convergence is good)
+        SliderParams slider( "Solver residual thresh", &gSliderLeastSquaresResidualThreshold );
+        slider.m_minVal = 0.0f;
+        slider.m_maxVal = 0.25f;
+        slider.m_callback = setLeastSquaresResidualThresholdCallback;
+        slider.m_userPointer = m_dynamicsWorld;
+        slider.m_clampToIntegers = false;
+        m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
+    }
    {
        ButtonParams button( "Solver use SIMD", 0, true );
        button.m_buttonId = SOLVER_SIMD;
@@ -618,20 +742,86 @@ void CommonRigidBodyMTBase::createDefaultParameters()
            m_guiHelper->getParameterInterface()->registerComboBox( comboParams );
        }
        {
-            // create a slider to set the number of threads to use
-            int numThreads = btGetTaskScheduler()->getNumThreads();
            // if slider has not been set yet (by another demo),
            if ( gSliderNumThreads <= 1.0f )
            {
+                // create a slider to set the number of threads to use
+                int numThreads = btGetTaskScheduler()->getNumThreads();
                gSliderNumThreads = float( numThreads );
            }
+            int maxNumThreads = btGetTaskScheduler()->getMaxNumThreads();
 			SliderParams slider("Thread count", &gSliderNumThreads);
 			slider.m_minVal = 1.0f;
-			slider.m_maxVal = float( BT_MAX_THREAD_COUNT );
+			slider.m_maxVal = float( maxNumThreads );
 			slider.m_callback = setThreadCountCallback;
            slider.m_clampToIntegers = true;
            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
        }
+        {
+            // a slider for the number of manifolds an island needs to be too large for parallel dispatch
+            if (gSliderIslandBatchingThreshold < 1.0)
+            {
+                gSliderIslandBatchingThreshold = float(btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching);
+            }
+            SliderParams slider( "IslandBatchThresh", &gSliderIslandBatchingThreshold );
+            slider.m_minVal = 1.0f;
+            slider.m_maxVal = 2000.0f;
+            slider.m_callback = setLargeIslandManifoldCountCallback;
+            slider.m_userPointer = NULL;
+            slider.m_clampToIntegers = true;
+            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
+        }
+        {
+            // create a combo box for selecting the batching method
+            static const char* sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_COUNT ];
+            {
+                sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_2D ] = "Batching: 2D Grid";
+                sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_3D ] = "Batching: 3D Grid";
+            };
+            ComboBoxParams comboParams;
+            comboParams.m_userPointer = sBatchingMethodComboBoxItems;
+            comboParams.m_numItems = btBatchedConstraints::BATCHING_METHOD_COUNT;
+            comboParams.m_startItem = static_cast<int>(btSequentialImpulseConstraintSolverMt::s_contactBatchingMethod);
+            comboParams.m_items = sBatchingMethodComboBoxItems;
+            comboParams.m_callback = setBatchingMethodComboBoxCallback;
+            m_guiHelper->getParameterInterface()->registerComboBox( comboParams );
+        }
+        {
+            // a slider for the sequentialImpulseConstraintSolverMt min batch size (when batching)
+            SliderParams slider( "Min batch size", &gSliderMinBatchSize );
+            slider.m_minVal = 1.0f;
+            slider.m_maxVal = 1000.0f;
+            slider.m_callback = setMinBatchSizeCallback;
+            slider.m_userPointer = NULL;
+            slider.m_clampToIntegers = true;
+            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
+        }
+        {
+            // a slider for the sequentialImpulseConstraintSolverMt max batch size (when batching)
+            SliderParams slider( "Max batch size", &gSliderMaxBatchSize );
+            slider.m_minVal = 1.0f;
+            slider.m_maxVal = 1000.0f;
+            slider.m_callback = setMaxBatchSizeCallback;
+            slider.m_userPointer = NULL;
+            slider.m_clampToIntegers = true;
+            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
+        }
+        {
+            // create a button to toggle debug drawing of batching visualization
+            ButtonParams button( "Visualize batching", 0, true );
+            bool* ptr = &btBatchedConstraints::s_debugDrawBatches;
+            button.m_initialState = *ptr;
+            button.m_userPointer = ptr;
+            button.m_callback = boolPtrButtonCallback;
+            m_guiHelper->getParameterInterface()->registerButtonParameter( button );
+        }
+        {
+            ButtonParams button( "Allow Nested ParallelFor", 0, true );
+            button.m_initialState = btSequentialImpulseConstraintSolverMt::s_allowNestedParallelForLoops;
+            button.m_userPointer = &btSequentialImpulseConstraintSolverMt::s_allowNestedParallelForLoops;
+            button.m_callback = boolPtrButtonCallback;
+            m_guiHelper->getParameterInterface()->registerButtonParameter( button );
+        }
 #endif // #if BT_THREADSAFE
    }
 }
@@ -643,6 +833,7 @@ void CommonRigidBodyMTBase::drawScreenText()
    int xCoord = 400;
    int yCoord = 30;
    int yStep = 30;
+    int indent = 30;
    if (m_solverType != gSolverType)
    {
        sprintf( msg, "restart example to change solver type" );
@@ -721,6 +912,34 @@ void CommonRigidBodyMTBase::drawScreenText()
            m_guiHelper->getAppInterface()->drawText( msg, xCoord, yCoord, 0.4f );
            yCoord += yStep;

+            sprintf( msg,
+                     "SolverTotal %5.3f ms",
+                     gProfiler.getAverageTime( Profiler::kRecordSolverTotal )*0.001f
+                     );
+            m_guiHelper->getAppInterface()->drawText( msg, xCoord, yCoord, 0.4f );
+            yCoord += yStep;
+
+            sprintf( msg,
+                     "SolverSetup %5.3f ms",
+                     gProfiler.getAverageTime( Profiler::kRecordSolverSetup )*0.001f
+                     );
+            m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f );
+            yCoord += yStep;
+
+            sprintf( msg,
+                     "SolverIterations %5.3f ms",
+                     gProfiler.getAverageTime( Profiler::kRecordSolverIterations )*0.001f
+                     );
+            m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f );
+            yCoord += yStep;
+
+            sprintf( msg,
+                     "SolverFinish %5.3f ms",
+                     gProfiler.getAverageTime( Profiler::kRecordSolverFinish )*0.001f
+                     );
+            m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f );
+            yCoord += yStep;
+
            sprintf( msg,
                     "PredictUnconstrainedMotion %5.3f ms",
                     gProfiler.getAverageTime( Profiler::kRecordPredictUnconstrainedMotion )*0.001f
--- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h
+++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h
@@ -14,6 +14,7 @@
 enum SolverType
 {
    SOLVER_TYPE_SEQUENTIAL_IMPULSE,
+    SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT,
    SOLVER_TYPE_NNCG,
    SOLVER_TYPE_MLCP_PGS,
    SOLVER_TYPE_MLCP_DANTZIG,
@@ -27,6 +28,7 @@ inline const char* getSolverTypeName( SolverType t )
    switch (t)
    {
    case SOLVER_TYPE_SEQUENTIAL_IMPULSE: return "SequentialImpulse";
+    case SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT: return "SequentialImpulseMt";
    case SOLVER_TYPE_NNCG: return "NNCG";
    case SOLVER_TYPE_MLCP_PGS: return "MLCP ProjectedGaussSeidel";
    case SOLVER_TYPE_MLCP_DANTZIG: return "MLCP Dantzig";
--- a/examples/MultiThreadedDemo/MultiThreadedDemo.cpp
+++ b/examples/MultiThreadedDemo/MultiThreadedDemo.cpp
@@ -25,10 +25,10 @@ subject to the following restrictions:



-static btScalar gSliderStackRows = 8.0f;
-static btScalar gSliderStackColumns = 6.0f;
-static btScalar gSliderStackHeight = 10.0f;
-static btScalar gSliderStackWidth = 1.0f;
+static btScalar gSliderStackRows = 1.0f;
+static btScalar gSliderStackColumns = 1.0f;
+static btScalar gSliderStackHeight = 15.0f;
+static btScalar gSliderStackWidth = 8.0f;
 static btScalar gSliderGroundHorizontalAmplitude = 0.0f;
 static btScalar gSliderGroundVerticalAmplitude = 0.0f;
 static btScalar gSliderGroundTilt = 0.0f;
@@ -75,6 +75,21 @@ public:
        btScalar tilt = gSliderGroundTilt * SIMD_2_PI / 360.0f;
        return btQuaternion( btVector3( 1.0f, 0.0f, 0.0f ), tilt );
    }
+    struct TestSumBody : public btIParallelSumBody
+    {
+        virtual btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+        {
+            btScalar sum = 0.0f;
+            for (int i = iBegin; i < iEnd; ++i)
+            {
+                if (i > 0)
+                {
+                    sum += 1.0f / btScalar(i);
+                }
+            }
+            return sum;
+        }
+    };
    virtual void stepSimulation( float deltaTime ) BT_OVERRIDE
    {
        if ( m_dynamicsWorld )
@@ -115,6 +130,14 @@ public:
            // always step by 1/60 for benchmarking
            m_dynamicsWorld->stepSimulation( 1.0f / 60.0f, 0 );
        }
+#if 0
+        {
+            // test parallelSum
+            TestSumBody testSumBody;
+            float testSum = btParallelSum( 1, 10000000, 10000, testSumBody );
+            printf( "sum = %f\n", testSum );
+        }
+#endif
    }

    virtual void initPhysics() BT_OVERRIDE;
--- a/examples/MultiThreading/btTaskScheduler.cpp
+++ b/examples/MultiThreading/btTaskScheduler.cpp
@@ -1,448 +0,0 @@
-
-#include "LinearMath/btTransform.h"
-#include "../Utils/b3Clock.h"
-#include "LinearMath/btAlignedObjectArray.h"
-#include "LinearMath/btThreads.h"
-#include "LinearMath/btQuickprof.h"
-#include <stdio.h>
-#include <algorithm>
-
-
-typedef void( *btThreadFunc )( void* userPtr, void* lsMemory );
-typedef void* ( *btThreadLocalStorageFunc )();
-
-#if BT_THREADSAFE
-
-#if defined( _WIN32 )
-
-#include "b3Win32ThreadSupport.h"
-
-b3ThreadSupportInterface* createThreadSupport( int numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName )
-{
-    b3Win32ThreadSupport::Win32ThreadConstructionInfo constructionInfo( uniqueName, threadFunc, localStoreFunc, numThreads );
-    //constructionInfo.m_priority = 0;  // highest priority (the default) -- can cause erratic performance when numThreads > numCores
-    //                                     we don't want worker threads to be higher priority than the main thread or the main thread could get
-    //                                     totally shut out and unable to tell the workers to stop
-    constructionInfo.m_priority = -1;  // normal priority
-    b3Win32ThreadSupport* threadSupport = new b3Win32ThreadSupport( constructionInfo );
-    return threadSupport;
-}
-
-#else // #if defined( _WIN32 )
-
-#include "b3PosixThreadSupport.h"
-
-b3ThreadSupportInterface* createThreadSupport( int numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName)
-{
-    b3PosixThreadSupport::ThreadConstructionInfo constructionInfo( uniqueName, threadFunc, localStoreFunc, numThreads );
-    b3ThreadSupportInterface* threadSupport = new b3PosixThreadSupport( constructionInfo );
-    return threadSupport;
-}
-
-#endif // #else // #if defined( _WIN32 )
-
-
-///
-/// getNumHardwareThreads()
-///
-///
-/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
-///
-#if __cplusplus >= 201103L
-
-#include <thread>
-
-int getNumHardwareThreads()
-{
-    return std::thread::hardware_concurrency();
-}
-
-#elif defined( _WIN32 )
-
-#define WIN32_LEAN_AND_MEAN
-
-#include <windows.h>
-
-int getNumHardwareThreads()
-{
-    // caps out at 32
-    SYSTEM_INFO info;
-    GetSystemInfo( &info );
-    return info.dwNumberOfProcessors;
-}
-
-#else
-
-int getNumHardwareThreads()
-{
-    return 0;  // don't know
-}
-
-#endif
-
-
-struct WorkerThreadStatus
-{
-    enum Type
-    {
-        kInvalid,
-        kWaitingForWork,
-        kWorking,
-        kSleeping,
-    };
-};
-
-
-struct IJob
-{
-    virtual void executeJob() = 0;
-};
-
-class ParallelForJob : public IJob
-{
-    const btIParallelForBody* mBody;
-    int mBegin;
-    int mEnd;
-
-public:
-    ParallelForJob()
-    {
-        mBody = NULL;
-        mBegin = 0;
-        mEnd = 0;
-    }
-    void init( int iBegin, int iEnd, const btIParallelForBody& body )
-    {
-        mBody = &body;
-        mBegin = iBegin;
-        mEnd = iEnd;
-    }
-    virtual void executeJob() BT_OVERRIDE
-    {
-        BT_PROFILE( "executeJob" );
-
-        // call the functor body to do the work
-        mBody->forLoop( mBegin, mEnd );
-    }
-};
-
-
-struct JobContext
-{
-    JobContext()
-    {
-        m_queueLock = NULL;
-        m_headIndex = 0;
-        m_tailIndex = 0;
-        m_workersShouldCheckQueue = false;
-        m_useSpinMutex = false;
-    }
-    b3CriticalSection* m_queueLock;
-    btSpinMutex m_mutex;
-    volatile bool m_workersShouldCheckQueue;
-
-    btAlignedObjectArray<IJob*> m_jobQueue;
-    bool m_queueIsEmpty;
-    int m_tailIndex;
-    int m_headIndex;
-    bool m_useSpinMutex;
-
-    void lockQueue()
-    {
-        if ( m_useSpinMutex )
-        {
-            m_mutex.lock();
-        }
-        else
-        {
-            m_queueLock->lock();
-        }
-    }
-    void unlockQueue()
-    {
-        if ( m_useSpinMutex )
-        {
-            m_mutex.unlock();
-        }
-        else
-        {
-            m_queueLock->unlock();
-        }
-    }
-    void clearQueue()
-    {
-        lockQueue();
-        m_headIndex = 0;
-        m_tailIndex = 0;
-        m_queueIsEmpty = true;
-        unlockQueue();
-        m_jobQueue.resizeNoInitialize( 0 );
-    }
-    void submitJob( IJob* job )
-    {
-        m_jobQueue.push_back( job );
-        lockQueue();
-        m_tailIndex++;
-        m_queueIsEmpty = false;
-        unlockQueue();
-    }
-    IJob* consumeJob()
-    {
-        if ( m_queueIsEmpty )
-        {
-            // lock free path. even if this is taken erroneously it isn't harmful
-            return NULL;
-        }
-        IJob* job = NULL;
-        lockQueue();
-        if ( !m_queueIsEmpty )
-        {
-            job = m_jobQueue[ m_headIndex++ ];
-            if ( m_headIndex == m_tailIndex )
-            {
-                m_queueIsEmpty = true;
-            }
-        }
-        unlockQueue();
-        return job;
-    }
-};
-
-
-struct WorkerThreadLocalStorage
-{
-    int threadId;
-    WorkerThreadStatus::Type status;
-};
-
-
-static void WorkerThreadFunc( void* userPtr, void* lsMemory )
-{
-    BT_PROFILE( "WorkerThreadFunc" );
-    WorkerThreadLocalStorage* localStorage = (WorkerThreadLocalStorage*) lsMemory;
-    localStorage->status = WorkerThreadStatus::kWaitingForWork;
-    //printf( "WorkerThreadFunc: worker %d start working\n", localStorage->threadId );
-
-    JobContext* jobContext = (JobContext*) userPtr;
-
-    while ( jobContext->m_workersShouldCheckQueue )
-    {
-        if ( IJob* job = jobContext->consumeJob() )
-        {
-            localStorage->status = WorkerThreadStatus::kWorking;
-            job->executeJob();
-            localStorage->status = WorkerThreadStatus::kWaitingForWork;
-        }
-        else
-        {
-            // todo: spin wait a bit to avoid hammering the empty queue
-        }
-    }
-
-    //printf( "WorkerThreadFunc stop working\n" );
-    localStorage->status = WorkerThreadStatus::kSleeping;
-    // go idle
-}
-
-
-static void* WorkerThreadAllocFunc()
-{
-    return new WorkerThreadLocalStorage;
-}
-
-
-
-class btTaskSchedulerDefault : public btITaskScheduler
-{
-    JobContext m_jobContext;
-    b3ThreadSupportInterface* m_threadSupport;
-    btAlignedObjectArray<ParallelForJob> m_jobs;
-    btSpinMutex m_antiNestingLock;  // prevent nested parallel-for
-    int m_numThreads;
-    int m_numWorkerThreads;
-    int m_numWorkersRunning;
-public:
-
-    btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
-    {
-        m_threadSupport = NULL;
-        m_numThreads = getNumHardwareThreads();
-        // if can't detect number of cores,
-        if ( m_numThreads == 0 )
-        {
-            // take a guess
-            m_numThreads = 4;
-        }
-        m_numWorkerThreads = m_numThreads - 1;
-        m_numWorkersRunning = 0;
-    }
-
-    virtual ~btTaskSchedulerDefault()
-    {
-        shutdown();
-    }
-
-    void init()
-    {
-        int maxNumWorkerThreads = BT_MAX_THREAD_COUNT - 1;
-        m_threadSupport = createThreadSupport( maxNumWorkerThreads, WorkerThreadFunc, WorkerThreadAllocFunc, "TaskScheduler" );
-        m_jobContext.m_queueLock = m_threadSupport->createCriticalSection();
-        for ( int i = 0; i < maxNumWorkerThreads; i++ )
-        {
-            WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
-            btAssert( storage );
-            storage->threadId = i;
-            storage->status = WorkerThreadStatus::kSleeping;
-        }
-        setWorkersActive( false ); // no work for them yet
-    }
-
-    virtual void shutdown()
-    {
-        setWorkersActive( false );
-        waitForWorkersToSleep();
-        m_threadSupport->deleteCriticalSection( m_jobContext.m_queueLock );
-        m_jobContext.m_queueLock = NULL;
-
-        delete m_threadSupport;
-        m_threadSupport = NULL;
-    }
-
-    void setWorkersActive( bool active )
-    {
-        m_jobContext.m_workersShouldCheckQueue = active;
-    }
-
-    virtual int getMaxNumThreads() const BT_OVERRIDE
-    {
-        return BT_MAX_THREAD_COUNT;
-    }
-
-    virtual int getNumThreads() const BT_OVERRIDE
-    {
-        return m_numThreads;
-    }
-
-    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
-    {
-        m_numThreads = btMax( btMin(numThreads, int(BT_MAX_THREAD_COUNT)), 1 );
-        m_numWorkerThreads = m_numThreads - 1;
-    }
-
-    void waitJobs()
-    {
-        BT_PROFILE( "waitJobs" );
-        // have the main thread work until the job queue is empty
-        for ( ;; )
-        {
-            if ( IJob* job = m_jobContext.consumeJob() )
-            {
-                job->executeJob();
-            }
-            else
-            {
-                break;
-            }
-        }
-        // done with jobs for now, tell workers to rest
-        setWorkersActive( false );
-        waitForWorkersToSleep();
-    }
-
-    void wakeWorkers()
-    {
-        BT_PROFILE( "wakeWorkers" );
-        btAssert( m_jobContext.m_workersShouldCheckQueue );
-        // tell each worker thread to start working
-        for ( int i = 0; i < m_numWorkerThreads; i++ )
-        {
-            m_threadSupport->runTask( B3_THREAD_SCHEDULE_TASK, &m_jobContext, i );
-            m_numWorkersRunning++;
-        }
-    }
-
-    void waitForWorkersToSleep()
-    {
-        BT_PROFILE( "waitForWorkersToSleep" );
-        while ( m_numWorkersRunning > 0 )
-        {
-            int iThread;
-            int threadStatus;
-            m_threadSupport->waitForResponse( &iThread, &threadStatus );  // wait for worker threads to finish working
-            m_numWorkersRunning--;
-        }
-        //m_threadSupport->waitForAllTasksToComplete();
-        for ( int i = 0; i < m_numWorkerThreads; i++ )
-        {
-            //m_threadSupport->waitForTaskCompleted( i );
-            WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
-            btAssert( storage );
-            btAssert( storage->status == WorkerThreadStatus::kSleeping );
-        }
-    }
-
-    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
-    {
-        BT_PROFILE( "parallelFor_ThreadSupport" );
-        btAssert( iEnd >= iBegin );
-        btAssert( grainSize >= 1 );
-        int iterationCount = iEnd - iBegin;
-        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
-        {
-            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;
-            btAssert( jobCount >= 2 );  // need more than one job for multithreading
-            if ( jobCount > m_jobs.size() )
-            {
-                m_jobs.resize( jobCount );
-            }
-            if ( jobCount > m_jobContext.m_jobQueue.capacity() )
-            {
-                m_jobContext.m_jobQueue.reserve( jobCount );
-            }
-
-            m_jobContext.clearQueue();
-            // prepare worker threads for incoming work
-            setWorkersActive( true );
-            wakeWorkers();
-            // submit all of the jobs
-            int iJob = 0;
-            for ( int i = iBegin; i < iEnd; i += grainSize )
-            {
-                btAssert( iJob < jobCount );
-                int iE = btMin( i + grainSize, iEnd );
-                ParallelForJob& job = m_jobs[ iJob ];
-                job.init( i, iE, body );
-                m_jobContext.submitJob( &job );
-                iJob++;
-            }
-
-            // put the main thread to work on emptying the job queue and then wait for all workers to finish
-            waitJobs();
-            m_antiNestingLock.unlock();
-        }
-        else
-        {
-            BT_PROFILE( "parallelFor_mainThread" );
-            // just run on main thread
-            body.forLoop( iBegin, iEnd );
-        }
-    }
-};
-
-
-
-btITaskScheduler* createDefaultTaskScheduler()
-{
-    btTaskSchedulerDefault* ts = new btTaskSchedulerDefault();
-    ts->init();
-    return ts;
-}
-
-#else // #if BT_THREADSAFE
-
-btITaskScheduler* createDefaultTaskScheduler()
-{
-    return NULL;
-}
-
-#endif // #else // #if BT_THREADSAFE
--- a/examples/MultiThreading/btTaskScheduler.h
+++ b/examples/MultiThreading/btTaskScheduler.h
@@ -1,26 +0,0 @@
-/*
-Copyright (c) 2003-2014 Erwin Coumans  http://bullet.googlecode.com
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose,
-including commercial applications, and to alter it and redistribute it freely,
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-
-#ifndef BT_TASK_SCHEDULER_H
-#define BT_TASK_SCHEDULER_H
-
-
-class btITaskScheduler;
-
-btITaskScheduler* createDefaultTaskScheduler();
-
-
-#endif // BT_TASK_SCHEDULER_H
--- a/setup.py
+++ b/setup.py
@@ -134,6 +134,9 @@ sources = ["examples/pybullet/pybullet.c"]\
 +["src/LinearMath/btConvexHullComputer.cpp"]\
 +["src/LinearMath/btQuickprof.cpp"]\
 +["src/LinearMath/btThreads.cpp"]\
+["src/LinearMath/TaskScheduler/btTaskScheduler.cpp"]\
+["src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp"]\
+["src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp"]\
 +["src/BulletCollision/BroadphaseCollision/btAxisSweep3.cpp"]\
 +["src/BulletCollision/BroadphaseCollision/btDbvt.cpp"]\
 +["src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp"]\
@@ -233,6 +236,7 @@ sources = ["examples/pybullet/pybullet.c"]\
 +["src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp"]\
 +["src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp"]\
 +["src/BulletDynamics/Dynamics/btSimpleDynamicsWorld.cpp"]\
+["src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp"]\
 +["src/BulletDynamics/ConstraintSolver/btConeTwistConstraint.cpp"]\
 +["src/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.cpp"]\
 +["src/BulletDynamics/ConstraintSolver/btSliderConstraint.cpp"]\
@@ -249,6 +253,7 @@ sources = ["examples/pybullet/pybullet.c"]\
 +["src/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.cpp"]\
 +["src/BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.cpp"]\
 +["src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp"]\
+["src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp"]\
 +["src/BulletDynamics/MLCPSolvers/btDantzigLCP.cpp"]\
 +["src/BulletDynamics/MLCPSolvers/btLemkeAlgorithm.cpp"]\
 +["src/BulletDynamics/MLCPSolvers/btMLCPSolver.cpp"]\
@@ -479,4 +484,3 @@ setup(
    packages=[x for x in find_packages('examples/pybullet/gym')],
    package_data = { 'pybullet_data': need_files }
 )
-
--- a/src/BulletDynamics/CMakeLists.txt
+++ b/src/BulletDynamics/CMakeLists.txt
@@ -15,6 +15,8 @@ SET(BulletDynamics_SRCS
 	ConstraintSolver/btHingeConstraint.cpp
 	ConstraintSolver/btPoint2PointConstraint.cpp
 	ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
+	ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
+	ConstraintSolver/btBatchedConstraints.cpp
 	ConstraintSolver/btNNCGConstraintSolver.cpp
 	ConstraintSolver/btSliderConstraint.cpp
 	ConstraintSolver/btSolve2LinearConstraint.cpp
@@ -62,6 +64,7 @@ SET(ConstraintSolver_HDRS
 	ConstraintSolver/btJacobianEntry.h
 	ConstraintSolver/btPoint2PointConstraint.h
 	ConstraintSolver/btSequentialImpulseConstraintSolver.h
+	ConstraintSolver/btSequentialImpulseConstraintSolverMt.h
 	ConstraintSolver/btNNCGConstraintSolver.h
 	ConstraintSolver/btSliderConstraint.h
 	ConstraintSolver/btSolve2LinearConstraint.h
--- a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
--- a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.h
+++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.h
@@ -0,0 +1,66 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_BATCHED_CONSTRAINTS_H
+#define BT_BATCHED_CONSTRAINTS_H
+
+#include "LinearMath/btThreads.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "BulletDynamics/ConstraintSolver/btSolverBody.h"
+#include "BulletDynamics/ConstraintSolver/btSolverConstraint.h"
+
+
+class btIDebugDraw;
+
+struct btBatchedConstraints
+{
+    enum BatchingMethod
+    {
+        BATCHING_METHOD_SPATIAL_GRID_2D,
+        BATCHING_METHOD_SPATIAL_GRID_3D,
+        BATCHING_METHOD_COUNT
+    };
+    struct Range
+    {
+        int begin;
+        int end;
+
+        Range() : begin( 0 ), end( 0 ) {}
+        Range( int _beg, int _end ) : begin( _beg ), end( _end ) {}
+    };
+
+    btAlignedObjectArray<int> m_constraintIndices;
+    btAlignedObjectArray<Range> m_batches;  // each batch is a range of indices in the m_constraintIndices array
+    btAlignedObjectArray<Range> m_phases;  // each phase is range of indices in the m_batches array
+    btAlignedObjectArray<char> m_phaseGrainSize;  // max grain size for each phase
+    btAlignedObjectArray<int> m_phaseOrder;  // phases can be done in any order, so we can randomize the order here
+    btIDebugDraw* m_debugDrawer;
+
+    static bool s_debugDrawBatches;
+
+    btBatchedConstraints() {m_debugDrawer=NULL;}
+    void setup( btConstraintArray* constraints,
+        const btAlignedObjectArray<btSolverBody>& bodies,
+        BatchingMethod batchingMethod,
+        int minBatchSize,
+        int maxBatchSize,
+        btAlignedObjectArray<char>* scratchMemory
+    );
+    bool validate( btConstraintArray* constraints, const btAlignedObjectArray<btSolverBody>& bodies ) const;
+};
+
+
+#endif // BT_BATCHED_CONSTRAINTS_H
+
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
@@ -1258,6 +1258,256 @@ void btSequentialImpulseConstraintSolver::convertContacts(btPersistentManifold**
 	}
 }

+
+void btSequentialImpulseConstraintSolver::convertJoint(btSolverConstraint* currentConstraintRow,
+    btTypedConstraint* constraint,
+    const btTypedConstraint::btConstraintInfo1& info1,
+    int solverBodyIdA,
+    int solverBodyIdB,
+    const btContactSolverInfo& infoGlobal
+    )
+{
+	const btRigidBody& rbA = constraint->getRigidBodyA();
+	const btRigidBody& rbB = constraint->getRigidBodyB();
+
+    const btSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA];
+    const btSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB];
+
+	int overrideNumSolverIterations = constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;
+	if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)
+		m_maxOverrideNumSolverIterations = overrideNumSolverIterations;
+
+	for (int j=0;j<info1.m_numConstraintRows;j++)
+	{
+		memset(&currentConstraintRow[j],0,sizeof(btSolverConstraint));
+		currentConstraintRow[j].m_lowerLimit = -SIMD_INFINITY;
+		currentConstraintRow[j].m_upperLimit = SIMD_INFINITY;
+		currentConstraintRow[j].m_appliedImpulse = 0.f;
+		currentConstraintRow[j].m_appliedPushImpulse = 0.f;
+		currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;
+		currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;
+		currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;
+	}
+
+    // these vectors are already cleared in initSolverBody, no need to redundantly clear again
+    btAssert(bodyAPtr->getDeltaLinearVelocity().isZero());
+    btAssert(bodyAPtr->getDeltaAngularVelocity().isZero());
+    btAssert(bodyAPtr->getPushVelocity().isZero());
+    btAssert(bodyAPtr->getTurnVelocity().isZero());
+    btAssert(bodyBPtr->getDeltaLinearVelocity().isZero());
+    btAssert(bodyBPtr->getDeltaAngularVelocity().isZero());
+    btAssert(bodyBPtr->getPushVelocity().isZero());
+    btAssert(bodyBPtr->getTurnVelocity().isZero());
+	//bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
+	//bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
+	//bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
+	//bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
+	//bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
+	//bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
+	//bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
+	//bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
+
+
+	btTypedConstraint::btConstraintInfo2 info2;
+	info2.fps = 1.f/infoGlobal.m_timeStep;
+	info2.erp = infoGlobal.m_erp;
+	info2.m_J1linearAxis = currentConstraintRow->m_contactNormal1;
+	info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal;
+	info2.m_J2linearAxis = currentConstraintRow->m_contactNormal2;
+	info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal;
+	info2.rowskip = sizeof(btSolverConstraint)/sizeof(btScalar);//check this
+	///the size of btSolverConstraint needs be a multiple of btScalar
+	btAssert(info2.rowskip*sizeof(btScalar)== sizeof(btSolverConstraint));
+	info2.m_constraintError = &currentConstraintRow->m_rhs;
+	currentConstraintRow->m_cfm = infoGlobal.m_globalCfm;
+	info2.m_damping = infoGlobal.m_damping;
+	info2.cfm = &currentConstraintRow->m_cfm;
+	info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;
+	info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
+	info2.m_numIterations = infoGlobal.m_numIterations;
+	constraint->getInfo2(&info2);
+
+	///finalize the constraint setup
+	for (int j=0;j<info1.m_numConstraintRows;j++)
+	{
+		btSolverConstraint& solverConstraint = currentConstraintRow[j];
+
+		if (solverConstraint.m_upperLimit>=constraint->getBreakingImpulseThreshold())
+		{
+			solverConstraint.m_upperLimit = constraint->getBreakingImpulseThreshold();
+		}
+
+		if (solverConstraint.m_lowerLimit<=-constraint->getBreakingImpulseThreshold())
+		{
+			solverConstraint.m_lowerLimit = -constraint->getBreakingImpulseThreshold();
+		}
+
+		solverConstraint.m_originalContactPoint = constraint;
+
+		{
+			const btVector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal;
+			solverConstraint.m_angularComponentA = constraint->getRigidBodyA().getInvInertiaTensorWorld()*ftorqueAxis1*constraint->getRigidBodyA().getAngularFactor();
+		}
+		{
+			const btVector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal;
+			solverConstraint.m_angularComponentB = constraint->getRigidBodyB().getInvInertiaTensorWorld()*ftorqueAxis2*constraint->getRigidBodyB().getAngularFactor();
+		}
+
+		{
+			btVector3 iMJlA = solverConstraint.m_contactNormal1*rbA.getInvMass();
+			btVector3 iMJaA = rbA.getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal;
+			btVector3 iMJlB = solverConstraint.m_contactNormal2*rbB.getInvMass();//sign of normal?
+			btVector3 iMJaB = rbB.getInvInertiaTensorWorld()*solverConstraint.m_relpos2CrossNormal;
+
+			btScalar sum = iMJlA.dot(solverConstraint.m_contactNormal1);
+			sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
+			sum += iMJlB.dot(solverConstraint.m_contactNormal2);
+			sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
+			btScalar fsum = btFabs(sum);
+			btAssert(fsum > SIMD_EPSILON);
+			btScalar sorRelaxation = 1.f;//todo: get from globalInfo?
+			solverConstraint.m_jacDiagABInv = fsum>SIMD_EPSILON?sorRelaxation/sum : 0.f;
+		}
+
+		{
+			btScalar rel_vel;
+			btVector3 externalForceImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalForceImpulse : btVector3(0,0,0);
+			btVector3 externalTorqueImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalTorqueImpulse : btVector3(0,0,0);
+
+			btVector3 externalForceImpulseB = bodyBPtr->m_originalBody ? bodyBPtr->m_externalForceImpulse : btVector3(0,0,0);
+			btVector3 externalTorqueImpulseB = bodyBPtr->m_originalBody ?bodyBPtr->m_externalTorqueImpulse : btVector3(0,0,0);
+
+			btScalar vel1Dotn = solverConstraint.m_contactNormal1.dot(rbA.getLinearVelocity()+externalForceImpulseA)
+								+ solverConstraint.m_relpos1CrossNormal.dot(rbA.getAngularVelocity()+externalTorqueImpulseA);
+
+			btScalar vel2Dotn = solverConstraint.m_contactNormal2.dot(rbB.getLinearVelocity()+externalForceImpulseB)
+												+ solverConstraint.m_relpos2CrossNormal.dot(rbB.getAngularVelocity()+externalTorqueImpulseB);
+
+			rel_vel = vel1Dotn+vel2Dotn;
+			btScalar restitution = 0.f;
+			btScalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2
+			btScalar	velocityError = restitution - rel_vel * info2.m_damping;
+			btScalar	penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv;
+			btScalar	velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv;
+			solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
+			solverConstraint.m_appliedImpulse = 0.f;
+		}
+	}
+}
+
+
+void btSequentialImpulseConstraintSolver::convertJoints(btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal)
+{
+    BT_PROFILE("convertJoints");
+	for (int j=0;j<numConstraints;j++)
+	{
+		btTypedConstraint* constraint = constraints[j];
+		constraint->buildJacobian();
+		constraint->internalSetAppliedImpulse(0.0f);
+	}
+
+	int totalNumRows = 0;
+
+	m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);
+	//calculate the total number of contraint rows
+	for (int i=0;i<numConstraints;i++)
+	{
+		btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+		btJointFeedback* fb = constraints[i]->getJointFeedback();
+		if (fb)
+		{
+			fb->m_appliedForceBodyA.setZero();
+			fb->m_appliedTorqueBodyA.setZero();
+			fb->m_appliedForceBodyB.setZero();
+			fb->m_appliedTorqueBodyB.setZero();
+		}
+
+		if (constraints[i]->isEnabled())
+		{
+			constraints[i]->getInfo1(&info1);
+		} else
+		{
+			info1.m_numConstraintRows = 0;
+			info1.nub = 0;
+		}
+		totalNumRows += info1.m_numConstraintRows;
+	}
+	m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows);
+
+
+	///setup the btSolverConstraints
+	int currentRow = 0;
+
+	for (int i=0;i<numConstraints;i++)
+	{
+		const btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+
+		if (info1.m_numConstraintRows)
+		{
+			btAssert(currentRow<totalNumRows);
+
+			btSolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[currentRow];
+			btTypedConstraint* constraint = constraints[i];
+			btRigidBody& rbA = constraint->getRigidBodyA();
+			btRigidBody& rbB = constraint->getRigidBodyB();
+
+			int solverBodyIdA = getOrInitSolverBody(rbA,infoGlobal.m_timeStep);
+            int solverBodyIdB = getOrInitSolverBody(rbB,infoGlobal.m_timeStep);
+
+            convertJoint(currentConstraintRow, constraint, info1, solverBodyIdA, solverBodyIdB, infoGlobal);
+        }
+		currentRow+=info1.m_numConstraintRows;
+	}
+}
+
+
+void btSequentialImpulseConstraintSolver::convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal)
+{
+    BT_PROFILE("convertBodies");
+	for (int i = 0; i < numBodies; i++)
+	{
+		bodies[i]->setCompanionId(-1);
+	}
+#if BT_THREADSAFE
+    m_kinematicBodyUniqueIdToSolverBodyTable.resize( 0 );
+#endif // BT_THREADSAFE
+
+	m_tmpSolverBodyPool.reserve(numBodies+1);
+	m_tmpSolverBodyPool.resize(0);
+
+	//btSolverBody& fixedBody = m_tmpSolverBodyPool.expand();
+    //initSolverBody(&fixedBody,0);
+
+    for (int i=0;i<numBodies;i++)
+	{
+		int bodyId = getOrInitSolverBody(*bodies[i],infoGlobal.m_timeStep);
+
+		btRigidBody* body = btRigidBody::upcast(bodies[i]);
+		if (body && body->getInvMass())
+		{
+			btSolverBody& solverBody = m_tmpSolverBodyPool[bodyId];
+			btVector3 gyroForce (0,0,0);
+			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_EXPLICIT)
+			{
+				gyroForce = body->computeGyroscopicForceExplicit(infoGlobal.m_maxGyroscopicForce);
+				solverBody.m_externalTorqueImpulse -= gyroForce*body->getInvInertiaTensorWorld()*infoGlobal.m_timeStep;
+			}
+			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_WORLD)
+			{
+				gyroForce = body->computeGyroscopicImpulseImplicit_World(infoGlobal.m_timeStep);
+				solverBody.m_externalTorqueImpulse += gyroForce;
+			}
+			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_BODY)
+			{
+				gyroForce = body->computeGyroscopicImpulseImplicit_Body(infoGlobal.m_timeStep);
+				solverBody.m_externalTorqueImpulse += gyroForce;
+
+			}
+		}
+	}
+}
+
+
 btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
 {
 	m_fixedBodyId = -1;
@@ -1344,250 +1594,13 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 #endif //BT_ADDITIONAL_DEBUG


-	for (int i = 0; i < numBodies; i++)
-	{
-		bodies[i]->setCompanionId(-1);
-	}
-#if BT_THREADSAFE
-    m_kinematicBodyUniqueIdToSolverBodyTable.resize( 0 );
-#endif // BT_THREADSAFE
-
-	m_tmpSolverBodyPool.reserve(numBodies+1);
-	m_tmpSolverBodyPool.resize(0);
-
-	//btSolverBody& fixedBody = m_tmpSolverBodyPool.expand();
-    //initSolverBody(&fixedBody,0);
-
 	//convert all bodies
+    convertBodies(bodies, numBodies, infoGlobal);

-
-	for (int i=0;i<numBodies;i++)
-	{
-		int bodyId = getOrInitSolverBody(*bodies[i],infoGlobal.m_timeStep);
-
-		btRigidBody* body = btRigidBody::upcast(bodies[i]);
-		if (body && body->getInvMass())
-		{
-			btSolverBody& solverBody = m_tmpSolverBodyPool[bodyId];
-			btVector3 gyroForce (0,0,0);
-			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_EXPLICIT)
-			{
-				gyroForce = body->computeGyroscopicForceExplicit(infoGlobal.m_maxGyroscopicForce);
-				solverBody.m_externalTorqueImpulse -= gyroForce*body->getInvInertiaTensorWorld()*infoGlobal.m_timeStep;
-			}
-			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_WORLD)
-			{
-				gyroForce = body->computeGyroscopicImpulseImplicit_World(infoGlobal.m_timeStep);
-				solverBody.m_externalTorqueImpulse += gyroForce;
-			}
-			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_BODY)
-			{
-				gyroForce = body->computeGyroscopicImpulseImplicit_Body(infoGlobal.m_timeStep);
-				solverBody.m_externalTorqueImpulse += gyroForce;
-
-			}
-			
-
-		}
-	}
-
-	if (1)
-	{
-		int j;
-		for (j=0;j<numConstraints;j++)
-		{
-			btTypedConstraint* constraint = constraints[j];
-			constraint->buildJacobian();
-			constraint->internalSetAppliedImpulse(0.0f);
-		}
-	}
-
-	//btRigidBody* rb0=0,*rb1=0;
-
-	//if (1)
-	{
-		{
-
-			int totalNumRows = 0;
-			int i;
-
-			m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);
-			//calculate the total number of contraint rows
-			for (i=0;i<numConstraints;i++)
-			{
-				btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
-				btJointFeedback* fb = constraints[i]->getJointFeedback();
-				if (fb)
-				{
-					fb->m_appliedForceBodyA.setZero();
-					fb->m_appliedTorqueBodyA.setZero();
-					fb->m_appliedForceBodyB.setZero();
-					fb->m_appliedTorqueBodyB.setZero();
-				}
-
-				if (constraints[i]->isEnabled())
-				{
-					constraints[i]->getInfo1(&info1);
-				} else
-				{
-					info1.m_numConstraintRows = 0;
-					info1.nub = 0;
-				}
-				totalNumRows += info1.m_numConstraintRows;
-			}
-			m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows);
-
-
-			///setup the btSolverConstraints
-			int currentRow = 0;
-
-			for (i=0;i<numConstraints;i++)
-			{
-				const btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
-
-				if (info1.m_numConstraintRows)
-				{
-					btAssert(currentRow<totalNumRows);
-
-					btSolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[currentRow];
-					btTypedConstraint* constraint = constraints[i];
-					btRigidBody& rbA = constraint->getRigidBodyA();
-					btRigidBody& rbB = constraint->getRigidBodyB();
-
-					int solverBodyIdA = getOrInitSolverBody(rbA,infoGlobal.m_timeStep);
-                    int solverBodyIdB = getOrInitSolverBody(rbB,infoGlobal.m_timeStep);
-
-                    btSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA];
-                    btSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB];
-
-
-
-
-					int overrideNumSolverIterations = constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;
-					if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)
-						m_maxOverrideNumSolverIterations = overrideNumSolverIterations;
-
-
-					int j;
-					for ( j=0;j<info1.m_numConstraintRows;j++)
-					{
-						memset(&currentConstraintRow[j],0,sizeof(btSolverConstraint));
-						currentConstraintRow[j].m_lowerLimit = -SIMD_INFINITY;
-						currentConstraintRow[j].m_upperLimit = SIMD_INFINITY;
-						currentConstraintRow[j].m_appliedImpulse = 0.f;
-						currentConstraintRow[j].m_appliedPushImpulse = 0.f;
-						currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;
-						currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;
-						currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;
-					}
-
-					bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
-					bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
-					bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
-					bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
-					bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
-					bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
-					bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
-					bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
-
-
-					btTypedConstraint::btConstraintInfo2 info2;
-					info2.fps = 1.f/infoGlobal.m_timeStep;
-					info2.erp = infoGlobal.m_erp;
-					info2.m_J1linearAxis = currentConstraintRow->m_contactNormal1;
-					info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal;
-					info2.m_J2linearAxis = currentConstraintRow->m_contactNormal2;
-					info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal;
-					info2.rowskip = sizeof(btSolverConstraint)/sizeof(btScalar);//check this
-					///the size of btSolverConstraint needs be a multiple of btScalar
-		            btAssert(info2.rowskip*sizeof(btScalar)== sizeof(btSolverConstraint));
-					info2.m_constraintError = &currentConstraintRow->m_rhs;
-					currentConstraintRow->m_cfm = infoGlobal.m_globalCfm;
-					info2.m_damping = infoGlobal.m_damping;
-					info2.cfm = &currentConstraintRow->m_cfm;
-					info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;
-					info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
-					info2.m_numIterations = infoGlobal.m_numIterations;
-					constraints[i]->getInfo2(&info2);
-
-					///finalize the constraint setup
-					for ( j=0;j<info1.m_numConstraintRows;j++)
-					{
-						btSolverConstraint& solverConstraint = currentConstraintRow[j];
-
-						if (solverConstraint.m_upperLimit>=constraints[i]->getBreakingImpulseThreshold())
-						{
-							solverConstraint.m_upperLimit = constraints[i]->getBreakingImpulseThreshold();
-						}
-
-						if (solverConstraint.m_lowerLimit<=-constraints[i]->getBreakingImpulseThreshold())
-						{
-							solverConstraint.m_lowerLimit = -constraints[i]->getBreakingImpulseThreshold();
-						}
-
-						solverConstraint.m_originalContactPoint = constraint;
-
-						{
-							const btVector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal;
-							solverConstraint.m_angularComponentA = constraint->getRigidBodyA().getInvInertiaTensorWorld()*ftorqueAxis1*constraint->getRigidBodyA().getAngularFactor();
-						}
-						{
-							const btVector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal;
-							solverConstraint.m_angularComponentB = constraint->getRigidBodyB().getInvInertiaTensorWorld()*ftorqueAxis2*constraint->getRigidBodyB().getAngularFactor();
-						}
-
-						{
-							btVector3 iMJlA = solverConstraint.m_contactNormal1*rbA.getInvMass();
-							btVector3 iMJaA = rbA.getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal;
-							btVector3 iMJlB = solverConstraint.m_contactNormal2*rbB.getInvMass();//sign of normal?
-							btVector3 iMJaB = rbB.getInvInertiaTensorWorld()*solverConstraint.m_relpos2CrossNormal;
-
-							btScalar sum = iMJlA.dot(solverConstraint.m_contactNormal1);
-							sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
-							sum += iMJlB.dot(solverConstraint.m_contactNormal2);
-							sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
-							btScalar fsum = btFabs(sum);
-							btAssert(fsum > SIMD_EPSILON);
-							btScalar sorRelaxation = 1.f;//todo: get from globalInfo?
-							solverConstraint.m_jacDiagABInv = fsum>SIMD_EPSILON?sorRelaxation/sum : 0.f;
-						}
-
-
-
-						{
-							btScalar rel_vel;
-							btVector3 externalForceImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalForceImpulse : btVector3(0,0,0);
-							btVector3 externalTorqueImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalTorqueImpulse : btVector3(0,0,0);
-
-							btVector3 externalForceImpulseB = bodyBPtr->m_originalBody ? bodyBPtr->m_externalForceImpulse : btVector3(0,0,0);
-							btVector3 externalTorqueImpulseB = bodyBPtr->m_originalBody ?bodyBPtr->m_externalTorqueImpulse : btVector3(0,0,0);
-
-							btScalar vel1Dotn = solverConstraint.m_contactNormal1.dot(rbA.getLinearVelocity()+externalForceImpulseA)
-												+ solverConstraint.m_relpos1CrossNormal.dot(rbA.getAngularVelocity()+externalTorqueImpulseA);
-
-							btScalar vel2Dotn = solverConstraint.m_contactNormal2.dot(rbB.getLinearVelocity()+externalForceImpulseB)
-																+ solverConstraint.m_relpos2CrossNormal.dot(rbB.getAngularVelocity()+externalTorqueImpulseB);
-
-							rel_vel = vel1Dotn+vel2Dotn;
-							btScalar restitution = 0.f;
-							btScalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2
-							btScalar	velocityError = restitution - rel_vel * info2.m_damping;
-							btScalar	penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv;
-							btScalar	velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv;
-							solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
-							solverConstraint.m_appliedImpulse = 0.f;
-
-
-						}
-					}
-				}
-				currentRow+=m_tmpConstraintSizesPool[i].m_numConstraintRows;
-			}
-		}
+    convertJoints(constraints, numConstraints, infoGlobal);

 	convertContacts(manifoldPtr,numManifolds,infoGlobal);

-	}

 //	btContactSolverInfo info = infoGlobal;

@@ -1627,6 +1640,7 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol

 btScalar btSequentialImpulseConstraintSolver::solveSingleIteration(int iteration, btCollisionObject** /*bodies */,int /*numBodies*/,btPersistentManifold** /*manifoldPtr*/, int /*numManifolds*/,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* /*debugDrawer*/)
 {
+    BT_PROFILE("solveSingleIteration");
 	btScalar leastSquaresResidual = 0.f;

 	int numNonContactPool = m_tmpSolverNonContactConstraintPool.size();
@@ -1805,6 +1819,7 @@ btScalar btSequentialImpulseConstraintSolver::solveSingleIteration(int iteration

 void btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
 {
+	BT_PROFILE("solveGroupCacheFriendlySplitImpulseIterations");
 	int iteration;
 	if (infoGlobal.m_splitImpulse)
 	{
@@ -1863,14 +1878,9 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyIterations(
 	return 0.f;
 }

-btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal)
+void btSequentialImpulseConstraintSolver::writeBackContacts(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal)
 {
-	int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
-	int i,j;
-
-	if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
-	{
-		for (j=0;j<numPoolConstraints;j++)
+		for (int j=iBegin; j<iEnd; j++)
 		{
 			const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[j];
 			btManifoldPoint* pt = (btManifoldPoint*) solveManifold.m_originalContactPoint;
@@ -1888,8 +1898,9 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCo
 		}
 }

-	numPoolConstraints = m_tmpSolverNonContactConstraintPool.size();
-	for (j=0;j<numPoolConstraints;j++)
+void btSequentialImpulseConstraintSolver::writeBackJoints(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal)
+{
+	for (int j=iBegin; j<iEnd; j++)
 	{
 		const btSolverConstraint& solverConstr = m_tmpSolverNonContactConstraintPool[j];
 		btTypedConstraint* constr = (btTypedConstraint*)solverConstr.m_originalContactPoint;
@@ -1909,10 +1920,12 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCo
 			constr->setEnabled(false);
 		}
 	}
+}


-
-	for ( i=0;i<m_tmpSolverBodyPool.size();i++)
+void btSequentialImpulseConstraintSolver::writeBackBodies(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal)
+{
+	for (int i=iBegin; i<iEnd; i++)
 	{
 		btRigidBody* body = m_tmpSolverBodyPool[i].m_originalBody;
 		if (body)
@@ -1936,6 +1949,19 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCo
 			m_tmpSolverBodyPool[i].m_originalBody->setCompanionId(-1);
 		}
 	}
+}
+
+btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal)
+{
+	BT_PROFILE("solveGroupCacheFriendlyFinish");
+
+	if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
+	{
+        writeBackContacts(0, m_tmpSolverContactConstraintPool.size(), infoGlobal);
+	}
+
+    writeBackJoints(0, m_tmpSolverNonContactConstraintPool.size(), infoGlobal);
+    writeBackBodies(0, m_tmpSolverBodyPool.size(), infoGlobal);

 	m_tmpSolverContactConstraintPool.resizeNoInitialize(0);
 	m_tmpSolverNonContactConstraintPool.resizeNoInitialize(0);
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
@@ -95,6 +95,10 @@ protected:

 	void	convertContact(btPersistentManifold* manifold,const btContactSolverInfo& infoGlobal);

+    virtual void convertJoints(btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal);
+    void convertJoint(btSolverConstraint* destConstraintRow, btTypedConstraint* srcConstraint, const btTypedConstraint::btConstraintInfo1& info1, int solverBodyIdA, int solverBodyIdB, const btContactSolverInfo& infoGlobal);
+
+    virtual void convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal);

 	btSimdScalar	resolveSplitPenetrationSIMD(btSolverBody& bodyA,btSolverBody& bodyB, const btSolverConstraint& contactConstraint)
    {
@@ -121,7 +125,9 @@ protected:
 		
 protected:
 	
-	
+    void writeBackContacts(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
+    void writeBackJoints(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
+    void writeBackBodies(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
 	virtual void solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
 	virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal);
 	virtual btScalar solveSingleIteration(int iteration, btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h
@@ -0,0 +1,154 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SEQUENTIAL_IMPULSE_CONSTRAINT_SOLVER_MT_H
+#define BT_SEQUENTIAL_IMPULSE_CONSTRAINT_SOLVER_MT_H
+
+#include "btSequentialImpulseConstraintSolver.h"
+#include "btBatchedConstraints.h"
+#include "LinearMath/btThreads.h"
+
+///
+/// btSequentialImpulseConstraintSolverMt
+///
+///  A multithreaded variant of the sequential impulse constraint solver. The constraints to be solved are grouped into
+///  batches and phases where each batch of constraints within a given phase can be solved in parallel with the rest.
+///  Ideally we want as few phases as possible, and each phase should have many batches, and all of the batches should
+///  have about the same number of constraints.
+///  This method works best on a large island of many constraints.
+///
+///  Supports all of the features of the normal sequential impulse solver such as:
+///    - split penetration impulse
+///    - rolling friction
+///    - interleaving constraints
+///    - warmstarting
+///    - 2 friction directions
+///    - randomized constraint ordering
+///    - early termination when leastSquaresResidualThreshold is satisfied
+///
+///  When the SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS flag is enabled, unlike the normal SequentialImpulse solver,
+///  the rolling friction is interleaved as well.
+///  Interleaving the contact penetration constraints with friction reduces the number of parallel loops that need to be done,
+///  which reduces threading overhead so it can be a performance win, however, it does seem to produce a less stable simulation,
+///  at least on stacks of blocks.
+///
+///  When the SOLVER_RANDMIZE_ORDER flag is enabled, the ordering of phases, and the ordering of constraints within each batch
+///  is randomized, however it does not swap constraints between batches.
+///  This is to avoid regenerating the batches for each solver iteration which would be quite costly in performance.
+///
+///  Note that a non-zero leastSquaresResidualThreshold could possibly affect the determinism of the simulation
+///  if the task scheduler's parallelSum operation is non-deterministic. The parallelSum operation can be non-deterministic
+///  because floating point addition is not associative due to rounding errors.
+///  The task scheduler can and should ensure that the result of any parallelSum operation is deterministic.
+///
+ATTRIBUTE_ALIGNED16(class) btSequentialImpulseConstraintSolverMt : public btSequentialImpulseConstraintSolver
+{
+public:
+	virtual void solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE;
+	virtual btScalar solveSingleIteration(int iteration, btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE;
+	virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE;
+	virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
+
+    // temp struct used to collect info from persistent manifolds into a cache-friendly struct using multiple threads
+    struct btContactManifoldCachedInfo
+    {
+        static const int MAX_NUM_CONTACT_POINTS = 4;
+
+        int numTouchingContacts;
+        int solverBodyIds[ 2 ];
+        int contactIndex;
+        int rollingFrictionIndex;
+        bool contactHasRollingFriction[ MAX_NUM_CONTACT_POINTS ];
+        btManifoldPoint* contactPoints[ MAX_NUM_CONTACT_POINTS ];
+    };
+    // temp struct used for setting up joint constraints in parallel
+    struct JointParams
+    {
+        int m_solverConstraint;
+        int m_solverBodyA;
+        int m_solverBodyB;
+    };
+    void internalInitMultipleJoints(btTypedConstraint** constraints, int iBegin, int iEnd);
+    void internalConvertMultipleJoints( const btAlignedObjectArray<JointParams>& jointParamsArray, btTypedConstraint** constraints, int iBegin, int iEnd, const btContactSolverInfo& infoGlobal );
+
+    // parameters to control batching
+    static bool s_allowNestedParallelForLoops;        // whether to allow nested parallel operations
+    static int s_minimumContactManifoldsForBatching;  // don't even try to batch if fewer manifolds than this
+    static btBatchedConstraints::BatchingMethod s_contactBatchingMethod;
+    static btBatchedConstraints::BatchingMethod s_jointBatchingMethod;
+    static int s_minBatchSize;  // desired number of constraints per batch
+    static int s_maxBatchSize;
+
+protected:
+    static const int CACHE_LINE_SIZE = 64;
+
+    btBatchedConstraints m_batchedContactConstraints;
+    btBatchedConstraints m_batchedJointConstraints;
+    int m_numFrictionDirections;
+    bool m_useBatching;
+    bool m_useObsoleteJointConstraints;
+    btAlignedObjectArray<btContactManifoldCachedInfo> m_manifoldCachedInfoArray;
+    btAlignedObjectArray<int> m_rollingFrictionIndexTable;  // lookup table mapping contact index to rolling friction index
+    btSpinMutex m_bodySolverArrayMutex;
+    char m_antiFalseSharingPadding[CACHE_LINE_SIZE]; // padding to keep mutexes in separate cachelines
+    btSpinMutex m_kinematicBodyUniqueIdToSolverBodyTableMutex;
+    btAlignedObjectArray<char> m_scratchMemory;
+
+    virtual void randomizeConstraintOrdering( int iteration, int numIterations );
+    virtual btScalar resolveAllJointConstraints( int iteration );
+    virtual btScalar resolveAllContactConstraints();
+    virtual btScalar resolveAllContactFrictionConstraints();
+    virtual btScalar resolveAllContactConstraintsInterleaved();
+    virtual btScalar resolveAllRollingFrictionConstraints();
+
+    virtual void setupBatchedContactConstraints();
+    virtual void setupBatchedJointConstraints();
+    virtual void convertJoints(btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
+	virtual void convertContacts(btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
+    virtual void convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
+
+	int getOrInitSolverBodyThreadsafe(btCollisionObject& body, btScalar timeStep);
+    void allocAllContactConstraints(btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal);
+    void setupAllContactConstraints(const btContactSolverInfo& infoGlobal);
+    void randomizeBatchedConstraintOrdering( btBatchedConstraints* batchedConstraints );
+
+public:
+
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
+	btSequentialImpulseConstraintSolverMt();
+	virtual ~btSequentialImpulseConstraintSolverMt();
+
+    btScalar resolveMultipleJointConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd, int iteration );
+    btScalar resolveMultipleContactConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
+    btScalar resolveMultipleContactSplitPenetrationImpulseConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
+    btScalar resolveMultipleContactFrictionConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
+    btScalar resolveMultipleContactRollingFrictionConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
+    btScalar resolveMultipleContactConstraintsInterleaved( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
+
+    void internalCollectContactManifoldCachedInfo(btContactManifoldCachedInfo* cachedInfoArray, btPersistentManifold** manifold, int numManifolds, const btContactSolverInfo& infoGlobal);
+    void internalAllocContactConstraints(const btContactManifoldCachedInfo* cachedInfoArray, int numManifolds);
+    void internalSetupContactConstraints(int iContact, const btContactSolverInfo& infoGlobal);
+    void internalConvertBodies(btCollisionObject** bodies, int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
+    void internalWriteBackContacts(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
+    void internalWriteBackJoints(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
+    void internalWriteBackBodies(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
+};
+
+
+
+
+#endif //BT_SEQUENTIAL_IMPULSE_CONSTRAINT_SOLVER_MT_H
+
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
@@ -50,63 +50,6 @@ subject to the following restrictions:
 #include "LinearMath/btSerializer.h"


-struct InplaceSolverIslandCallbackMt : public btSimulationIslandManagerMt::IslandCallback
-{
-	btContactSolverInfo*	m_solverInfo;
-	btConstraintSolver*		m_solver;
-	btIDebugDraw*			m_debugDrawer;
-	btDispatcher*			m_dispatcher;
-
-	InplaceSolverIslandCallbackMt(
-		btConstraintSolver*	solver,
-		btStackAlloc* stackAlloc,
-		btDispatcher* dispatcher)
-		:m_solverInfo(NULL),
-		m_solver(solver),
-		m_debugDrawer(NULL),
-		m_dispatcher(dispatcher)
-	{
-
-	}
-
-	InplaceSolverIslandCallbackMt& operator=(InplaceSolverIslandCallbackMt& other)
-	{
-		btAssert(0);
-		(void)other;
-		return *this;
-	}
-
-	SIMD_FORCE_INLINE void setup ( btContactSolverInfo* solverInfo, btIDebugDraw* debugDrawer)
-	{
-		btAssert(solverInfo);
-		m_solverInfo = solverInfo;
-		m_debugDrawer = debugDrawer;
-	}
-
-
-	virtual	void	processIsland( btCollisionObject** bodies,
-                                   int numBodies,
-                                   btPersistentManifold** manifolds,
-                                   int numManifolds,
-                                   btTypedConstraint** constraints,
-                                   int numConstraints,
-                                   int islandId
-                                   )
-	{
-        m_solver->solveGroup( bodies,
-                              numBodies,
-                              manifolds,
-                              numManifolds,
-                              constraints,
-                              numConstraints,
-                              *m_solverInfo,
-                              m_debugDrawer,
-                              m_dispatcher
-                              );
-    }
-
-};
-

 ///
 /// btConstraintSolverPoolMt
@@ -209,7 +152,12 @@ void btConstraintSolverPoolMt::reset()
 /// btDiscreteDynamicsWorldMt
 ///

-btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher, btBroadphaseInterface* pairCache, btConstraintSolverPoolMt* constraintSolver, btCollisionConfiguration* collisionConfiguration)
+btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,
+    btBroadphaseInterface* pairCache,
+    btConstraintSolverPoolMt* constraintSolver,
+    btConstraintSolver* constraintSolverMt,
+    btCollisionConfiguration* collisionConfiguration
+)
 : btDiscreteDynamicsWorld(dispatcher,pairCache,constraintSolver,collisionConfiguration)
 {
 	if (m_ownsIslandManager)
@@ -217,31 +165,18 @@ btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher, b
 		m_islandManager->~btSimulationIslandManager();
 		btAlignedFree( m_islandManager);
 	}
-    {
-		void* mem = btAlignedAlloc(sizeof(InplaceSolverIslandCallbackMt),16);
-		m_solverIslandCallbackMt = new (mem) InplaceSolverIslandCallbackMt (m_constraintSolver, 0, dispatcher);
-    }
 	{
 		void* mem = btAlignedAlloc(sizeof(btSimulationIslandManagerMt),16);
 		btSimulationIslandManagerMt* im = new (mem) btSimulationIslandManagerMt();
        im->setMinimumSolverBatchSize( m_solverInfo.m_minimumSolverBatchSize );
        m_islandManager = im;
 	}
+    m_constraintSolverMt = constraintSolverMt;
 }


 btDiscreteDynamicsWorldMt::~btDiscreteDynamicsWorldMt()
 {
-	if (m_solverIslandCallbackMt)
-	{
-		m_solverIslandCallbackMt->~InplaceSolverIslandCallbackMt();
-		btAlignedFree(m_solverIslandCallbackMt);
-	}
-	if (m_ownsConstraintSolver)
-	{
-		m_constraintSolver->~btConstraintSolver();
-		btAlignedFree(m_constraintSolver);
-	}
 }


@@ -249,12 +184,17 @@ void btDiscreteDynamicsWorldMt::solveConstraints(btContactSolverInfo& solverInfo
 {
 	BT_PROFILE("solveConstraints");

-	m_solverIslandCallbackMt->setup(&solverInfo, getDebugDrawer());
 	m_constraintSolver->prepareSolve(getCollisionWorld()->getNumCollisionObjects(), getCollisionWorld()->getDispatcher()->getNumManifolds());

 	/// solve all the constraints for this island
    btSimulationIslandManagerMt* im = static_cast<btSimulationIslandManagerMt*>(m_islandManager);
-    im->buildAndProcessIslands( getCollisionWorld()->getDispatcher(), getCollisionWorld(), m_constraints, m_solverIslandCallbackMt );
+    btSimulationIslandManagerMt::SolverParams solverParams;
+    solverParams.m_solverPool = m_constraintSolver;
+    solverParams.m_solverMt = m_constraintSolverMt;
+    solverParams.m_solverInfo = &solverInfo;
+    solverParams.m_debugDrawer = m_debugDrawer;
+    solverParams.m_dispatcher = getCollisionWorld()->getDispatcher();
+    im->buildAndProcessIslands( getCollisionWorld()->getDispatcher(), getCollisionWorld(), m_constraints, solverParams );

 	m_constraintSolver->allSolved(solverInfo, m_debugDrawer);
 }
@@ -325,3 +265,14 @@ void btDiscreteDynamicsWorldMt::integrateTransforms( btScalar timeStep )
    }
 }

+
+int	btDiscreteDynamicsWorldMt::stepSimulation( btScalar timeStep, int maxSubSteps, btScalar fixedTimeStep )
+{
+    int numSubSteps = btDiscreteDynamicsWorld::stepSimulation(timeStep, maxSubSteps, fixedTimeStep);
+    if (btITaskScheduler* scheduler = btGetTaskScheduler())
+    {
+        // tell Bullet's threads to sleep, so other threads can run
+        scheduler->sleepWorkerThreadsHint();
+    }
+    return numSubSteps;
+}
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
@@ -21,7 +21,6 @@ subject to the following restrictions:
 #include "btSimulationIslandManagerMt.h"
 #include "BulletDynamics/ConstraintSolver/btConstraintSolver.h"

-struct InplaceSolverIslandCallbackMt;

 ///
 /// btConstraintSolverPoolMt - masquerades as a constraint solver, but really it is a threadsafe pool of them.
@@ -88,7 +87,7 @@ private:
 ATTRIBUTE_ALIGNED16(class) btDiscreteDynamicsWorldMt : public btDiscreteDynamicsWorld
 {
 protected:
-    InplaceSolverIslandCallbackMt* m_solverIslandCallbackMt;
+    btConstraintSolver* m_constraintSolverMt;

    virtual void solveConstraints(btContactSolverInfo& solverInfo) BT_OVERRIDE;

@@ -126,9 +125,12 @@ public:
 	btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,
        btBroadphaseInterface* pairCache,
        btConstraintSolverPoolMt* constraintSolver,   // Note this should be a solver-pool for multi-threading
+        btConstraintSolver* constraintSolverMt,    // single multi-threaded solver for large islands (or NULL)
        btCollisionConfiguration* collisionConfiguration
    );
 	virtual ~btDiscreteDynamicsWorldMt();
+
+    virtual int	stepSimulation( btScalar timeStep, int maxSubSteps, btScalar fixedTimeStep ) BT_OVERRIDE;
 };

 #endif //BT_DISCRETE_DYNAMICS_WORLD_H
--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
@@ -22,6 +22,7 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletCollision/CollisionDispatch/btCollisionWorld.h"
 #include "BulletDynamics/ConstraintSolver/btTypedConstraint.h"
+#include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h"  // for s_minimumContactManifoldsForBatching

 //#include <stdio.h>
 #include "LinearMath/btQuickprof.h"
@@ -275,7 +276,7 @@ btSimulationIslandManagerMt::Island* btSimulationIslandManagerMt::allocateIsland
 void btSimulationIslandManagerMt::buildIslands( btDispatcher* dispatcher, btCollisionWorld* collisionWorld )
 {

-	BT_PROFILE("islandUnionFindAndQuickSort");
+	BT_PROFILE("buildIslands");
 	
 	btCollisionObjectArray& collisionObjects = collisionWorld->getCollisionObjectArray();

@@ -544,59 +545,103 @@ void btSimulationIslandManagerMt::mergeIslands()
 }


-void btSimulationIslandManagerMt::serialIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback )
+void btSimulationIslandManagerMt::solveIsland(btConstraintSolver* solver, Island& island, const SolverParams& solverParams)
+{
+    btPersistentManifold** manifolds = island.manifoldArray.size() ? &island.manifoldArray[ 0 ] : NULL;
+    btTypedConstraint** constraintsPtr = island.constraintArray.size() ? &island.constraintArray[ 0 ] : NULL;
+    solver->solveGroup( &island.bodyArray[ 0 ],
+        island.bodyArray.size(),
+        manifolds,
+        island.manifoldArray.size(),
+        constraintsPtr,
+        island.constraintArray.size(),
+        *solverParams.m_solverInfo,
+        solverParams.m_debugDrawer,
+        solverParams.m_dispatcher
+    );
+}
+
+
+void btSimulationIslandManagerMt::serialIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, const SolverParams& solverParams )
 {
    BT_PROFILE( "serialIslandDispatch" );
    // serial dispatch
    btAlignedObjectArray<Island*>& islands = *islandsPtr;
+    btConstraintSolver* solver = solverParams.m_solverMt ? solverParams.m_solverMt : solverParams.m_solverPool;
    for ( int i = 0; i < islands.size(); ++i )
    {
-        Island* island = islands[ i ];
-        btPersistentManifold** manifolds = island->manifoldArray.size() ? &island->manifoldArray[ 0 ] : NULL;
-        btTypedConstraint** constraintsPtr = island->constraintArray.size() ? &island->constraintArray[ 0 ] : NULL;
-        callback->processIsland( &island->bodyArray[ 0 ],
-                                 island->bodyArray.size(),
-                                 manifolds,
-                                 island->manifoldArray.size(),
-                                 constraintsPtr,
-                                 island->constraintArray.size(),
-                                 island->id
-                                 );
+        solveIsland(solver, *islands[ i ], solverParams);
    }
 }

+
 struct UpdateIslandDispatcher : public btIParallelForBody
 {
-    btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr;
-    btSimulationIslandManagerMt::IslandCallback* callback;
+    btAlignedObjectArray<btSimulationIslandManagerMt::Island*>& m_islandsPtr;
+    const btSimulationIslandManagerMt::SolverParams& m_solverParams;
+
+    UpdateIslandDispatcher(btAlignedObjectArray<btSimulationIslandManagerMt::Island*>& islandsPtr, const btSimulationIslandManagerMt::SolverParams& solverParams)
+        : m_islandsPtr(islandsPtr), m_solverParams(solverParams)
+    {}

    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
    {
+        btConstraintSolver* solver = m_solverParams.m_solverPool;
        for ( int i = iBegin; i < iEnd; ++i )
        {
-            btSimulationIslandManagerMt::Island* island = ( *islandsPtr )[ i ];
-            btPersistentManifold** manifolds = island->manifoldArray.size() ? &island->manifoldArray[ 0 ] : NULL;
-            btTypedConstraint** constraintsPtr = island->constraintArray.size() ? &island->constraintArray[ 0 ] : NULL;
-            callback->processIsland( &island->bodyArray[ 0 ],
-                island->bodyArray.size(),
-                manifolds,
-                island->manifoldArray.size(),
-                constraintsPtr,
-                island->constraintArray.size(),
-                island->id
-            );
+            btSimulationIslandManagerMt::Island* island = m_islandsPtr[ i ];
+            btSimulationIslandManagerMt::solveIsland( solver, *island, m_solverParams );
        }
    }
 };

-void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback )
+
+void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, const SolverParams& solverParams )
 {
    BT_PROFILE( "parallelIslandDispatch" );
-    int grainSize = 1;  // iterations per task
-    UpdateIslandDispatcher dispatcher;
-    dispatcher.islandsPtr = islandsPtr;
-    dispatcher.callback = callback;
-    btParallelFor( 0, islandsPtr->size(), grainSize, dispatcher );
+    //
+    // if there are islands with many contacts, it may be faster to submit these
+    // large islands *serially* to a single parallel constraint solver, and then later
+    // submit the remaining smaller islands in parallel to multiple sequential solvers.
+    //
+    // Some task schedulers do not deal well with nested parallelFor loops. One implementation
+    // of OpenMP was actually slower than doing everything single-threaded. Intel TBB
+    // on the other hand, seems to do a pretty respectable job with it.
+    //
+    // When solving islands in parallel, the worst case performance happens when there
+    // is one very large island and then perhaps a smattering of very small
+    // islands -- one worker thread takes the large island and the remaining workers
+    // tear through the smaller islands and then sit idle waiting for the first worker
+    // to finish. Solving islands in parallel works best when there are numerous small
+    // islands, roughly equal in size.
+    //
+    // By contrast, the other approach -- the parallel constraint solver -- is only
+    // able to deliver a worthwhile speedup when the island is large. For smaller islands,
+    // it is difficult to extract a useful amount of parallelism -- the overhead of grouping
+    // the constraints into batches and sending the batches to worker threads can nullify
+    // any gains from parallelism.
+    //
+
+    UpdateIslandDispatcher dispatcher(*islandsPtr, solverParams);
+    // We take advantage of the fact the islands are sorted in order of decreasing size
+    int iBegin = 0;
+    if (solverParams.m_solverMt)
+    {
+        while ( iBegin < islandsPtr->size() )
+        {
+            btSimulationIslandManagerMt::Island* island = ( *islandsPtr )[ iBegin ];
+            if ( island->manifoldArray.size() < btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching )
+            {
+                // OK to submit the rest of the array in parallel
+                break;
+            }
+            // serial dispatch to parallel solver for large islands (if any)
+            solveIsland(solverParams.m_solverMt, *island, solverParams);
+            ++iBegin;
+        }
+    }
+    // parallel dispatch to sequential solvers for rest
+    btParallelFor( iBegin, islandsPtr->size(), 1, dispatcher );
 }


@@ -604,15 +649,14 @@ void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray<I
 void btSimulationIslandManagerMt::buildAndProcessIslands( btDispatcher* dispatcher,
                                                        btCollisionWorld* collisionWorld,
                                                        btAlignedObjectArray<btTypedConstraint*>& constraints,
-                                                        IslandCallback* callback
+                                                        const SolverParams& solverParams
                                                        )
 {
+	BT_PROFILE("buildAndProcessIslands");
 	btCollisionObjectArray& collisionObjects = collisionWorld->getCollisionObjectArray();

 	buildIslands(dispatcher,collisionWorld);

-	BT_PROFILE("processIslands");
-
 	if(!getSplitIslands())
 	{
        btPersistentManifold** manifolds = dispatcher->getInternalManifoldPointer();
@@ -644,13 +688,16 @@ void btSimulationIslandManagerMt::buildAndProcessIslands( btDispatcher* dispatch
            }
        }
        btTypedConstraint** constraintsPtr = constraints.size() ? &constraints[ 0 ] : NULL;
-		callback->processIsland(&collisionObjects[0],
+        btConstraintSolver* solver = solverParams.m_solverMt ? solverParams.m_solverMt : solverParams.m_solverPool;
+        solver->solveGroup(&collisionObjects[0],
                           collisionObjects.size(),
                           manifolds,
                           maxNumManifolds,
                           constraintsPtr,
                           constraints.size(),
-                                 -1
+                           *solverParams.m_solverInfo,
+                           solverParams.m_debugDrawer,
+                           solverParams.m_dispatcher
                           );
 	}
 	else
@@ -671,6 +718,6 @@ void btSimulationIslandManagerMt::buildAndProcessIslands( btDispatcher* dispatch
            mergeIslands();
        }
        // dispatch islands to solver
-        m_islandDispatch( &m_activeIslands, callback );
+        m_islandDispatch( &m_activeIslands, solverParams );
 	}
 }
--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
@@ -19,7 +19,9 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btSimulationIslandManager.h"

 class btTypedConstraint;
-
+class btConstraintSolver;
+struct btContactSolverInfo;
+class btIDebugDraw;

 ///
 /// SimulationIslandManagerMt -- Multithread capable version of SimulationIslandManager
@@ -45,22 +47,19 @@ public:

        void append( const Island& other );  // add bodies, manifolds, constraints to my own
    };
-    struct	IslandCallback
+    struct SolverParams
    {
-        virtual ~IslandCallback() {};
-
-        virtual	void processIsland( btCollisionObject** bodies,
-                                    int numBodies,
-                                    btPersistentManifold** manifolds,
-                                    int numManifolds,
-                                    btTypedConstraint** constraints,
-                                    int numConstraints,
-                                    int islandId
-                                    ) = 0;
+        btConstraintSolver*		m_solverPool;
+        btConstraintSolver*		m_solverMt;
+        btContactSolverInfo*	m_solverInfo;
+        btIDebugDraw*			m_debugDrawer;
+        btDispatcher*			m_dispatcher;
    };
-    typedef void( *IslandDispatchFunc ) ( btAlignedObjectArray<Island*>* islands, IslandCallback* callback );
-    static void serialIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback );
-    static void parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback );
+    static void solveIsland(btConstraintSolver* solver, Island& island, const SolverParams& solverParams);
+
+    typedef void( *IslandDispatchFunc ) ( btAlignedObjectArray<Island*>* islands, const SolverParams& solverParams );
+    static void serialIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, const SolverParams& solverParams );
+    static void parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, const SolverParams& solverParams );
 protected:
    btAlignedObjectArray<Island*> m_allocatedIslands;  // owner of all Islands
    btAlignedObjectArray<Island*> m_activeIslands;  // islands actively in use
@@ -83,7 +82,11 @@ public:
 	btSimulationIslandManagerMt();
 	virtual ~btSimulationIslandManagerMt();

-    virtual void buildAndProcessIslands( btDispatcher* dispatcher, btCollisionWorld* collisionWorld, btAlignedObjectArray<btTypedConstraint*>& constraints, IslandCallback* callback );
+    virtual void buildAndProcessIslands( btDispatcher* dispatcher,
+        btCollisionWorld* collisionWorld,
+        btAlignedObjectArray<btTypedConstraint*>& constraints,
+        const SolverParams& solverParams
+    );

 	virtual void buildIslands(btDispatcher* dispatcher,btCollisionWorld* colWorld);

@@ -106,5 +109,6 @@ public:
    }
 };

+
 #endif //BT_SIMULATION_ISLAND_MANAGER_H

--- a/src/LinearMath/CMakeLists.txt
+++ b/src/LinearMath/CMakeLists.txt
@@ -14,6 +14,9 @@ SET(LinearMath_SRCS
 	btSerializer64.cpp
 	btThreads.cpp
 	btVector3.cpp
+	TaskScheduler/btTaskScheduler.cpp
+	TaskScheduler/btThreadSupportPosix.cpp
+	TaskScheduler/btThreadSupportWin32.cpp
 )

 SET(LinearMath_HDRS
@@ -44,6 +47,7 @@ SET(LinearMath_HDRS
 	btTransform.h
 	btTransformUtil.h
 	btVector3.h
+	TaskScheduler/btThreadSupportInterface.h
 )

 ADD_LIBRARY(LinearMath ${LinearMath_SRCS} ${LinearMath_HDRS})
--- a/src/LinearMath/TaskScheduler/btTaskScheduler.cpp
+++ b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp
@@ -0,0 +1,788 @@
+
+#include "LinearMath/btMinMax.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btThreads.h"
+#include "LinearMath/btQuickprof.h"
+#include <stdio.h>
+#include <algorithm>
+
+
+
+#if BT_THREADSAFE
+
+#include "btThreadSupportInterface.h"
+
+#if defined( _WIN32 )
+
+#define WIN32_LEAN_AND_MEAN
+
+#include <windows.h>
+
+#endif
+
+
+typedef unsigned long long btU64;
+static const int kCacheLineSize = 64;
+
+void btSpinPause()
+{
+#if defined( _WIN32 )
+    YieldProcessor();
+#endif
+}
+
+
+struct WorkerThreadStatus
+{
+    enum Type
+    {
+        kInvalid,
+        kWaitingForWork,
+        kWorking,
+        kSleeping,
+    };
+};
+
+
+ATTRIBUTE_ALIGNED64(class) WorkerThreadDirectives
+{
+    static const int kMaxThreadCount = BT_MAX_THREAD_COUNT;
+    // directives for all worker threads packed into a single cacheline
+    char m_threadDirs[kMaxThreadCount];
+
+public:
+    enum Type
+    {
+        kInvalid,
+        kGoToSleep,         // go to sleep
+        kStayAwakeButIdle,  // wait for not checking job queue
+        kScanForJobs,       // actively scan job queue for jobs
+    };
+    WorkerThreadDirectives()
+    {
+        for ( int i = 0; i < kMaxThreadCount; ++i )
+        {
+            m_threadDirs[ i ] = 0;
+        }
+    }
+
+    Type getDirective(int threadId)
+    {
+        btAssert(threadId < kMaxThreadCount);
+        return static_cast<Type>(m_threadDirs[threadId]);
+    }
+
+    void setDirectiveByRange(int threadBegin, int threadEnd, Type dir)
+    {
+        btAssert( threadBegin < threadEnd );
+        btAssert( threadEnd <= kMaxThreadCount );
+        char dirChar = static_cast<char>(dir);
+        for ( int i = threadBegin; i < threadEnd; ++i )
+        {
+            m_threadDirs[ i ] = dirChar;
+        }
+    }
+};
+
+class JobQueue;
+
+ATTRIBUTE_ALIGNED64(struct) ThreadLocalStorage
+{
+    int m_threadId;
+    WorkerThreadStatus::Type m_status;
+    int m_numJobsFinished;
+    btSpinMutex m_mutex;
+    btScalar m_sumResult;
+    WorkerThreadDirectives * m_directive;
+    JobQueue* m_queue;
+    btClock* m_clock;
+    unsigned int m_cooldownTime;
+};
+
+
+struct IJob
+{
+    virtual void executeJob(int threadId) = 0;
+};
+
+class ParallelForJob : public IJob
+{
+    const btIParallelForBody* m_body;
+    int m_begin;
+    int m_end;
+
+public:
+    ParallelForJob( int iBegin, int iEnd, const btIParallelForBody& body )
+    {
+        m_body = &body;
+        m_begin = iBegin;
+        m_end = iEnd;
+    }
+    virtual void executeJob(int threadId) BT_OVERRIDE
+    {
+        BT_PROFILE( "executeJob" );
+
+        // call the functor body to do the work
+        m_body->forLoop( m_begin, m_end );
+    }
+};
+
+
+class ParallelSumJob : public IJob
+{
+    const btIParallelSumBody* m_body;
+    ThreadLocalStorage* m_threadLocalStoreArray;
+    int m_begin;
+    int m_end;
+
+public:
+    ParallelSumJob( int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalStorage* tls )
+    {
+        m_body = &body;
+        m_threadLocalStoreArray = tls;
+        m_begin = iBegin;
+        m_end = iEnd;
+    }
+    virtual void executeJob( int threadId ) BT_OVERRIDE
+    {
+        BT_PROFILE( "executeJob" );
+
+        // call the functor body to do the work
+        btScalar val = m_body->sumLoop( m_begin, m_end );
+#if BT_PARALLEL_SUM_DETERMINISTISM
+        // by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision)
+        const float TRUNC_SCALE = float(1<<19);
+        val = floor(val*TRUNC_SCALE+0.5f)/TRUNC_SCALE;  // truncate some bits
+#endif
+        m_threadLocalStoreArray[threadId].m_sumResult += val;
+    }
+};
+
+
+ATTRIBUTE_ALIGNED64(class) JobQueue
+{
+    btThreadSupportInterface* m_threadSupport;
+    btCriticalSection* m_queueLock;
+    btSpinMutex m_mutex;
+
+    btAlignedObjectArray<IJob*> m_jobQueue;
+    char* m_jobMem;
+    int m_jobMemSize;
+    bool m_queueIsEmpty;
+    int m_tailIndex;
+    int m_headIndex;
+    int m_allocSize;
+    bool m_useSpinMutex;
+    btAlignedObjectArray<JobQueue*> m_neighborContexts;
+    char m_cachePadding[kCacheLineSize];  // prevent false sharing
+
+    void freeJobMem()
+    {
+        if ( m_jobMem )
+        {
+            // free old
+            btAlignedFree(m_jobMem);
+            m_jobMem = NULL;
+        }
+    }
+    void resizeJobMem(int newSize)
+    {
+        if (newSize > m_jobMemSize)
+        {
+            freeJobMem();
+            m_jobMem = static_cast<char*>(btAlignedAlloc(newSize, kCacheLineSize));
+            m_jobMemSize = newSize;
+        }
+    }
+
+public:
+
+    JobQueue()
+    {
+        m_jobMem = NULL;
+        m_jobMemSize = 0;
+        m_threadSupport = NULL;
+        m_queueLock = NULL;
+        m_headIndex = 0;
+        m_tailIndex = 0;
+        m_useSpinMutex = false;
+    }
+    ~JobQueue()
+    {
+        freeJobMem();
+        if (m_queueLock && m_threadSupport)
+        {
+            m_threadSupport->deleteCriticalSection(m_queueLock);
+            m_queueLock = NULL;
+        }
+    }
+    void init(btThreadSupportInterface* threadSup, btAlignedObjectArray<JobQueue>* contextArray)
+    {
+        m_threadSupport = threadSup;
+        if (threadSup)
+        {
+            m_queueLock = m_threadSupport->createCriticalSection();
+        }
+        setupJobStealing(contextArray, contextArray->size());
+    }
+    void setupJobStealing(btAlignedObjectArray<JobQueue>* contextArray, int numActiveContexts)
+    {
+        btAlignedObjectArray<JobQueue>& contexts = *contextArray;
+        int selfIndex = 0;
+        for (int i = 0; i < contexts.size(); ++i)
+        {
+            if ( this == &contexts[ i ] )
+            {
+                selfIndex = i;
+                break;
+            }
+        }
+        int numNeighbors = btMin(2, contexts.size() - 1);
+        int neighborOffsets[ ] = {-1, 1, -2, 2, -3, 3};
+        int numOffsets = sizeof(neighborOffsets)/sizeof(neighborOffsets[0]);
+        m_neighborContexts.reserve( numNeighbors );
+        m_neighborContexts.resizeNoInitialize(0);
+        for (int i = 0; i < numOffsets && m_neighborContexts.size() < numNeighbors; i++)
+        {
+            int neighborIndex = selfIndex + neighborOffsets[i];
+            if ( neighborIndex >= 0 && neighborIndex < numActiveContexts)
+            {
+                m_neighborContexts.push_back( &contexts[ neighborIndex ] );
+            }
+        }
+    }
+
+    bool isQueueEmpty() const {return m_queueIsEmpty;}
+    void lockQueue()
+    {
+        if ( m_useSpinMutex )
+        {
+            m_mutex.lock();
+        }
+        else
+        {
+            m_queueLock->lock();
+        }
+    }
+    void unlockQueue()
+    {
+        if ( m_useSpinMutex )
+        {
+            m_mutex.unlock();
+        }
+        else
+        {
+            m_queueLock->unlock();
+        }
+    }
+    void clearQueue(int jobCount, int jobSize)
+    {
+        lockQueue();
+        m_headIndex = 0;
+        m_tailIndex = 0;
+        m_allocSize = 0;
+        m_queueIsEmpty = true;
+        int jobBufSize = jobSize * jobCount;
+        // make sure we have enough memory allocated to store jobs
+        if ( jobBufSize > m_jobMemSize )
+        {
+            resizeJobMem( jobBufSize );
+        }
+        // make sure job queue is big enough
+        if ( jobCount > m_jobQueue.capacity() )
+        {
+            m_jobQueue.reserve( jobCount );
+        }
+        unlockQueue();
+        m_jobQueue.resizeNoInitialize( 0 );
+    }
+    void* allocJobMem(int jobSize)
+    {
+        btAssert(m_jobMemSize >= (m_allocSize + jobSize));
+        void* jobMem = &m_jobMem[m_allocSize];
+        m_allocSize += jobSize;
+        return jobMem;
+    }
+    void submitJob( IJob* job )
+    {
+        btAssert( reinterpret_cast<char*>( job ) >= &m_jobMem[ 0 ] && reinterpret_cast<char*>( job ) < &m_jobMem[ 0 ] + m_allocSize );
+        m_jobQueue.push_back( job );
+        lockQueue();
+        m_tailIndex++;
+        m_queueIsEmpty = false;
+        unlockQueue();
+    }
+    IJob* consumeJobFromOwnQueue()
+    {
+        if ( m_queueIsEmpty )
+        {
+            // lock free path. even if this is taken erroneously it isn't harmful
+            return NULL;
+        }
+        IJob* job = NULL;
+        lockQueue();
+        if ( !m_queueIsEmpty )
+        {
+            job = m_jobQueue[ m_headIndex++ ];
+            btAssert( reinterpret_cast<char*>( job ) >= &m_jobMem[ 0 ] && reinterpret_cast<char*>( job ) < &m_jobMem[ 0 ] + m_allocSize );
+            if ( m_headIndex == m_tailIndex )
+            {
+                m_queueIsEmpty = true;
+            }
+        }
+        unlockQueue();
+        return job;
+    }
+    IJob* consumeJob()
+    {
+        if (IJob* job = consumeJobFromOwnQueue())
+        {
+            return job;
+        }
+        // own queue is empty, try to steal from neighbor
+        for (int i = 0; i < m_neighborContexts.size(); ++i)
+        {
+            JobQueue* otherContext = m_neighborContexts[ i ];
+            if ( IJob* job = otherContext->consumeJobFromOwnQueue() )
+            {
+                return job;
+            }
+        }
+        return NULL;
+    }
+};
+
+
+static void WorkerThreadFunc( void* userPtr )
+{
+    BT_PROFILE( "WorkerThreadFunc" );
+    ThreadLocalStorage* localStorage = (ThreadLocalStorage*) userPtr;
+    JobQueue* jobQueue = localStorage->m_queue;
+
+    bool shouldSleep = false;
+    int threadId = localStorage->m_threadId;
+    while (! shouldSleep)
+    {
+        // do work
+        localStorage->m_mutex.lock();
+        while ( IJob* job = jobQueue->consumeJob() )
+        {
+            localStorage->m_status = WorkerThreadStatus::kWorking;
+            job->executeJob( threadId );
+            localStorage->m_numJobsFinished++;
+        }
+        localStorage->m_status = WorkerThreadStatus::kWaitingForWork;
+        localStorage->m_mutex.unlock();
+        btU64 clockStart = localStorage->m_clock->getTimeMicroseconds();
+        // while queue is empty,
+        while (jobQueue->isQueueEmpty())
+        {
+            // todo: spin wait a bit to avoid hammering the empty queue
+            btSpinPause();
+            if ( localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kGoToSleep )
+            {
+                shouldSleep = true;
+                break;
+            }
+            // if jobs are incoming,
+            if ( localStorage->m_directive->getDirective( threadId ) == WorkerThreadDirectives::kScanForJobs )
+            {
+                clockStart = localStorage->m_clock->getTimeMicroseconds(); // reset clock
+            }
+            else
+            {
+                for ( int i = 0; i < 50; ++i )
+                {
+                    btSpinPause();
+                    btSpinPause();
+                    btSpinPause();
+                    btSpinPause();
+                    if (localStorage->m_directive->getDirective( threadId ) == WorkerThreadDirectives::kScanForJobs || !jobQueue->isQueueEmpty())
+                    {
+                        break;
+                    }
+                }
+                // if no jobs incoming and queue has been empty for the cooldown time, sleep
+                btU64 timeElapsed = localStorage->m_clock->getTimeMicroseconds() - clockStart;
+                if (timeElapsed > localStorage->m_cooldownTime)
+                {
+                    shouldSleep = true;
+                    break;
+                }
+            }
+        }
+    }
+
+    // go sleep
+    localStorage->m_mutex.lock();
+    localStorage->m_status = WorkerThreadStatus::kSleeping;
+    localStorage->m_mutex.unlock();
+}
+
+
+class btTaskSchedulerDefault : public btITaskScheduler
+{
+    btThreadSupportInterface* m_threadSupport;
+    WorkerThreadDirectives* m_workerDirective;
+    btAlignedObjectArray<JobQueue> m_jobQueues;
+    btAlignedObjectArray<JobQueue*> m_perThreadJobQueues;
+    btAlignedObjectArray<ThreadLocalStorage> m_threadLocalStorage;
+    btSpinMutex m_antiNestingLock;  // prevent nested parallel-for
+    btClock m_clock;
+    int m_numThreads;
+    int m_numWorkerThreads;
+    int m_numActiveJobQueues;
+    int m_maxNumThreads;
+    int m_numJobs;
+    static const int kFirstWorkerThreadId = 1;
+public:
+
+    btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
+    {
+        m_threadSupport = NULL;
+        m_workerDirective = NULL;
+    }
+
+    virtual ~btTaskSchedulerDefault()
+    {
+        waitForWorkersToSleep();
+        if (m_threadSupport)
+        {
+            delete m_threadSupport;
+            m_threadSupport = NULL;
+        }
+        if (m_workerDirective)
+        {
+            btAlignedFree(m_workerDirective);
+            m_workerDirective = NULL;
+        }
+    }
+
+    void init()
+    {
+        btThreadSupportInterface::ConstructionInfo constructionInfo( "TaskScheduler", WorkerThreadFunc );
+        m_threadSupport = btThreadSupportInterface::create( constructionInfo );
+        m_workerDirective = static_cast<WorkerThreadDirectives*>(btAlignedAlloc(sizeof(*m_workerDirective), 64));
+
+        m_numWorkerThreads = m_threadSupport->getNumWorkerThreads();
+        m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1;
+        m_numThreads = m_maxNumThreads;
+        // ideal to have one job queue for each physical processor (except for the main thread which needs no queue)
+        int numThreadsPerQueue = m_threadSupport->getLogicalToPhysicalCoreRatio();
+        int numJobQueues = (numThreadsPerQueue == 1) ? (m_maxNumThreads-1) : (m_maxNumThreads / numThreadsPerQueue);
+        m_jobQueues.resize(numJobQueues);
+        m_numActiveJobQueues = numJobQueues;
+        for ( int i = 0; i < m_jobQueues.size(); ++i )
+        {
+            m_jobQueues[i].init( m_threadSupport, &m_jobQueues );
+        }
+        m_perThreadJobQueues.resize(m_numThreads);
+        for ( int i = 0; i < m_numThreads; i++ )
+        {
+            JobQueue* jq = NULL;
+            // only worker threads get a job queue
+            if (i > 0)
+            {
+                if (numThreadsPerQueue == 1)
+                {
+                    // one queue per worker thread
+                    jq = &m_jobQueues[ i - kFirstWorkerThreadId ];
+                }
+                else
+                {
+                    // 2 threads share each queue
+                    jq = &m_jobQueues[ i / numThreadsPerQueue ];
+                }
+            }
+            m_perThreadJobQueues[i] = jq;
+        }
+        m_threadLocalStorage.resize(m_numThreads);
+        for ( int i = 0; i < m_numThreads; i++ )
+        {
+            ThreadLocalStorage& storage = m_threadLocalStorage[i];
+            storage.m_threadId = i;
+            storage.m_directive = m_workerDirective;
+            storage.m_status = WorkerThreadStatus::kSleeping;
+            storage.m_cooldownTime = 1000; // 1000 microseconds, threads go to sleep after this long if they have nothing to do
+            storage.m_clock = &m_clock;
+            storage.m_queue = m_perThreadJobQueues[i];
+        }
+        setWorkerDirectives( WorkerThreadDirectives::kGoToSleep ); // no work for them yet
+        setNumThreads( m_threadSupport->getCacheFriendlyNumThreads() );
+    }
+
+    void setWorkerDirectives(WorkerThreadDirectives::Type dir)
+    {
+        m_workerDirective->setDirectiveByRange(kFirstWorkerThreadId, m_numThreads, dir);
+    }
+
+    virtual int getMaxNumThreads() const BT_OVERRIDE
+    {
+        return m_maxNumThreads;
+    }
+
+    virtual int getNumThreads() const BT_OVERRIDE
+    {
+        return m_numThreads;
+    }
+
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
+    {
+        m_numThreads = btMax( btMin(numThreads, int(m_maxNumThreads)), 1 );
+        m_numWorkerThreads = m_numThreads - 1;
+        m_numActiveJobQueues = 0;
+        // if there is at least 1 worker,
+        if ( m_numWorkerThreads > 0 )
+        {
+            // re-setup job stealing between queues to avoid attempting to steal from an inactive job queue
+            JobQueue* lastActiveContext = m_perThreadJobQueues[ m_numThreads - 1 ];
+            int iLastActiveContext = lastActiveContext - &m_jobQueues[0];
+            m_numActiveJobQueues = iLastActiveContext + 1;
+            for ( int i = 0; i < m_jobQueues.size(); ++i )
+            {
+                m_jobQueues[ i ].setupJobStealing( &m_jobQueues, m_numActiveJobQueues );
+            }
+        }
+        m_workerDirective->setDirectiveByRange(m_numThreads, BT_MAX_THREAD_COUNT, WorkerThreadDirectives::kGoToSleep);
+    }
+
+    void waitJobs()
+    {
+        BT_PROFILE( "waitJobs" );
+        // have the main thread work until the job queues are empty
+        int numMainThreadJobsFinished = 0;
+        for ( int i = 0; i < m_numActiveJobQueues; ++i )
+        {
+            while ( IJob* job = m_jobQueues[i].consumeJob() )
+            {
+                job->executeJob( 0 );
+                numMainThreadJobsFinished++;
+            }
+        }
+
+        // done with jobs for now, tell workers to rest (but not sleep)
+        setWorkerDirectives( WorkerThreadDirectives::kStayAwakeButIdle );
+
+        btU64 clockStart = m_clock.getTimeMicroseconds();
+        // wait for workers to finish any jobs in progress
+        while ( true )
+        {
+            int numWorkerJobsFinished = 0;
+            for ( int iThread = kFirstWorkerThreadId; iThread < m_numThreads; ++iThread )
+            {
+                ThreadLocalStorage* storage = &m_threadLocalStorage[iThread];
+                storage->m_mutex.lock();
+                numWorkerJobsFinished += storage->m_numJobsFinished;
+                storage->m_mutex.unlock();
+            }
+            if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs)
+            {
+                break;
+            }
+            btU64 timeElapsed = m_clock.getTimeMicroseconds() - clockStart;
+            btAssert(timeElapsed < 1000);
+            if (timeElapsed > 100000)
+            {
+                break;
+            }
+            btSpinPause();
+        }
+    }
+
+    void wakeWorkers(int numWorkersToWake)
+    {
+        BT_PROFILE( "wakeWorkers" );
+        btAssert( m_workerDirective->getDirective(1) == WorkerThreadDirectives::kScanForJobs );
+        int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads);
+        int numActiveWorkers = 0;
+        for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
+        {
+            // note this count of active workers is not necessarily totally reliable, because a worker thread could be
+            // just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare.
+            ThreadLocalStorage& storage = m_threadLocalStorage[ kFirstWorkerThreadId + iWorker ];
+            if (storage.m_status != WorkerThreadStatus::kSleeping)
+            {
+                numActiveWorkers++;
+            }
+        }
+        for ( int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker )
+        {
+            ThreadLocalStorage& storage = m_threadLocalStorage[ kFirstWorkerThreadId + iWorker ];
+            if (storage.m_status == WorkerThreadStatus::kSleeping)
+            {
+                m_threadSupport->runTask( iWorker, &storage );
+                numActiveWorkers++;
+            }
+        }
+    }
+
+    void waitForWorkersToSleep()
+    {
+        BT_PROFILE( "waitForWorkersToSleep" );
+        setWorkerDirectives( WorkerThreadDirectives::kGoToSleep );
+        m_threadSupport->waitForAllTasks();
+        for ( int i = kFirstWorkerThreadId; i < m_numThreads; i++ )
+        {
+            ThreadLocalStorage& storage = m_threadLocalStorage[i];
+            btAssert( storage.m_status == WorkerThreadStatus::kSleeping );
+        }
+    }
+
+    virtual void sleepWorkerThreadsHint() BT_OVERRIDE
+    {
+        BT_PROFILE( "sleepWorkerThreadsHint" );
+        // hint the task scheduler that we may not be using these threads for a little while
+        setWorkerDirectives( WorkerThreadDirectives::kGoToSleep );
+    }
+
+    void prepareWorkerThreads()
+    {
+        for ( int i = kFirstWorkerThreadId; i < m_numThreads; ++i )
+        {
+            ThreadLocalStorage& storage = m_threadLocalStorage[i];
+            storage.m_mutex.lock();
+            storage.m_numJobsFinished = 0;
+            storage.m_mutex.unlock();
+        }
+        setWorkerDirectives( WorkerThreadDirectives::kScanForJobs );
+    }
+
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelFor_ThreadSupport" );
+        btAssert( iEnd >= iBegin );
+        btAssert( grainSize >= 1 );
+        int iterationCount = iEnd - iBegin;
+        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
+        {
+            typedef ParallelForJob JobType;
+            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;
+            m_numJobs = jobCount;
+            btAssert( jobCount >= 2 );  // need more than one job for multithreading
+            int jobSize = sizeof( JobType );
+
+            for (int i = 0; i < m_numActiveJobQueues; ++i)
+            {
+                m_jobQueues[i].clearQueue( jobCount, jobSize );
+            }
+            // prepare worker threads for incoming work
+            prepareWorkerThreads();
+            // submit all of the jobs
+            int iJob = 0;
+            int iThread = kFirstWorkerThreadId;  // first worker thread
+            for ( int i = iBegin; i < iEnd; i += grainSize )
+            {
+                btAssert( iJob < jobCount );
+                int iE = btMin( i + grainSize, iEnd );
+                JobQueue* jq = m_perThreadJobQueues[ iThread ];
+                btAssert(jq);
+                btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
+                void* jobMem = jq->allocJobMem(jobSize);
+                JobType* job = new ( jobMem ) ParallelForJob( i, iE, body );  // placement new
+                jq->submitJob( job );
+                iJob++;
+                iThread++;
+                if ( iThread >= m_numThreads )
+                {
+                    iThread = kFirstWorkerThreadId;  // first worker thread
+                }
+            }
+            wakeWorkers( jobCount - 1 );
+
+            // put the main thread to work on emptying the job queue and then wait for all workers to finish
+            waitJobs();
+            m_antiNestingLock.unlock();
+        }
+        else
+        {
+            BT_PROFILE( "parallelFor_mainThread" );
+            // just run on main thread
+            body.forLoop( iBegin, iEnd );
+        }
+    }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_ThreadSupport" );
+        btAssert( iEnd >= iBegin );
+        btAssert( grainSize >= 1 );
+        int iterationCount = iEnd - iBegin;
+        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
+        {
+            typedef ParallelSumJob JobType;
+            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;
+            m_numJobs = jobCount;
+            btAssert( jobCount >= 2 );  // need more than one job for multithreading
+            int jobSize = sizeof( JobType );
+            for (int i = 0; i < m_numActiveJobQueues; ++i)
+            {
+                m_jobQueues[i].clearQueue( jobCount, jobSize );
+            }
+
+            // initialize summation
+            for ( int iThread = 0; iThread < m_numThreads; ++iThread )
+            {
+                m_threadLocalStorage[iThread].m_sumResult = btScalar(0);
+            }
+
+            // prepare worker threads for incoming work
+            prepareWorkerThreads();
+            // submit all of the jobs
+            int iJob = 0;
+            int iThread = kFirstWorkerThreadId;  // first worker thread
+            for ( int i = iBegin; i < iEnd; i += grainSize )
+            {
+                btAssert( iJob < jobCount );
+                int iE = btMin( i + grainSize, iEnd );
+                JobQueue* jq = m_perThreadJobQueues[ iThread ];
+                btAssert(jq);
+                btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
+                void* jobMem = jq->allocJobMem(jobSize);
+                JobType* job = new ( jobMem ) ParallelSumJob( i, iE, body, &m_threadLocalStorage[0] );  // placement new
+                jq->submitJob( job );
+                iJob++;
+                iThread++;
+                if ( iThread >= m_numThreads )
+                {
+                    iThread = kFirstWorkerThreadId;  // first worker thread
+                }
+            }
+            wakeWorkers( jobCount - 1 );
+
+            // put the main thread to work on emptying the job queue and then wait for all workers to finish
+            waitJobs();
+
+            // add up all the thread sums
+            btScalar sum = btScalar(0);
+            for ( int iThread = 0; iThread < m_numThreads; ++iThread )
+            {
+                sum += m_threadLocalStorage[ iThread ].m_sumResult;
+            }
+            m_antiNestingLock.unlock();
+            return sum;
+        }
+        else
+        {
+            BT_PROFILE( "parallelSum_mainThread" );
+            // just run on main thread
+            return body.sumLoop( iBegin, iEnd );
+        }
+    }
+};
+
+
+
+btITaskScheduler* btCreateDefaultTaskScheduler()
+{
+    btTaskSchedulerDefault* ts = new btTaskSchedulerDefault();
+    ts->init();
+    return ts;
+}
+
+#else // #if BT_THREADSAFE
+
+btITaskScheduler* btCreateDefaultTaskScheduler()
+{
+    return NULL;
+}
+
+#endif // #else // #if BT_THREADSAFE
--- a/src/LinearMath/TaskScheduler/btThreadSupportInterface.h
+++ b/src/LinearMath/TaskScheduler/btThreadSupportInterface.h
@@ -0,0 +1,70 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2018 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_THREAD_SUPPORT_INTERFACE_H
+#define BT_THREAD_SUPPORT_INTERFACE_H
+
+
+
+class btCriticalSection
+{
+public:
+    btCriticalSection() {}
+    virtual ~btCriticalSection() {}
+
+    virtual void lock() = 0;
+    virtual void unlock() = 0;
+};
+
+
+class btThreadSupportInterface
+{
+public:
+
+    virtual ~btThreadSupportInterface() {}
+
+    virtual int getNumWorkerThreads() const = 0;  // number of worker threads (total number of logical processors - 1)
+    virtual int getCacheFriendlyNumThreads() const = 0;  // the number of logical processors sharing a single L3 cache
+    virtual int getLogicalToPhysicalCoreRatio() const = 0;  // the number of logical processors per physical processor (usually 1 or 2)
+    virtual void runTask( int threadIndex, void* userData ) = 0;
+    virtual void waitForAllTasks() = 0;
+
+    virtual btCriticalSection* createCriticalSection() = 0;
+    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) = 0;
+
+    typedef void( *ThreadFunc )( void* userPtr );
+
+    struct ConstructionInfo
+    {
+        ConstructionInfo( const char* uniqueName,
+            ThreadFunc userThreadFunc,
+            int threadStackSize = 65535
+        )
+            :m_uniqueName( uniqueName ),
+            m_userThreadFunc( userThreadFunc ),
+            m_threadStackSize( threadStackSize )
+        {
+        }
+
+        const char*     m_uniqueName;
+        ThreadFunc      m_userThreadFunc;
+        int             m_threadStackSize;
+    };
+
+    static btThreadSupportInterface* create( const ConstructionInfo& info );
+};
+
+#endif //BT_THREAD_SUPPORT_INTERFACE_H
+
--- a/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
+++ b/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
@@ -0,0 +1,364 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2018 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#if BT_THREADSAFE && !defined( _WIN32 )
+
+
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btThreads.h"
+#include "LinearMath/btMinMax.h"
+#include "btThreadSupportInterface.h"
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+
+
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600 //for definition of pthread_barrier_t, see http://pages.cs.wisc.edu/~travitch/pthreads_primer.html
+#endif //_XOPEN_SOURCE
+#include <pthread.h>
+#include <semaphore.h>
+#include <unistd.h>   //for sysconf
+
+
+///
+/// getNumHardwareThreads()
+///
+///
+/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
+///
+#if __cplusplus >= 201103L
+
+#include <thread>
+
+int btGetNumHardwareThreads()
+{
+    return std::thread::hardware_concurrency();
+}
+
+#else
+
+int btGetNumHardwareThreads()
+{
+    return sysconf( _SC_NPROCESSORS_ONLN );
+}
+
+#endif
+
+
+// btThreadSupportPosix helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
+class btThreadSupportPosix : public btThreadSupportInterface
+{
+public:
+    struct btThreadStatus
+    {
+        int m_taskId;
+        int m_commandId;
+        int m_status;
+
+        ThreadFunc m_userThreadFunc;
+        void* m_userPtr; //for taskDesc etc
+
+        pthread_t thread;
+        //each tread will wait until this signal to start its work
+        sem_t* startSemaphore;
+
+        // this is a copy of m_mainSemaphore, 
+        //each tread will signal once it is finished with its work
+        sem_t* m_mainSemaphore;
+        unsigned long threadUsed;
+    };
+private:
+    typedef unsigned long long UINT64;
+
+    btAlignedObjectArray<btThreadStatus> m_activeThreadStatus;
+    // m_mainSemaphoresemaphore will signal, if and how many threads are finished with their work
+    sem_t* m_mainSemaphore;
+    int m_numThreads;
+    UINT64 m_startedThreadsMask;
+    void startThreads( const ConstructionInfo& threadInfo );
+    void stopThreads();
+    int waitForResponse();
+
+public:
+    btThreadSupportPosix( const ConstructionInfo& threadConstructionInfo );
+    virtual ~btThreadSupportPosix();
+
+    virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
+    // TODO: return the number of logical processors sharing the first L3 cache
+    virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return m_numThreads + 1; }
+    // TODO: detect if CPU has hyperthreading enabled
+    virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return 1; }
+
+    virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE;
+    virtual void waitForAllTasks() BT_OVERRIDE;
+
+    virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
+    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE;
+};
+
+
+#define checkPThreadFunction(returnValue) \
+    if(0 != returnValue) { \
+        printf("PThread problem at line %i in file %s: %i %d\n", __LINE__, __FILE__, returnValue, errno); \
+    }
+
+// The number of threads should be equal to the number of available cores
+// Todo: each worker should be linked to a single core, using SetThreadIdealProcessor.
+
+
+btThreadSupportPosix::btThreadSupportPosix( const ConstructionInfo& threadConstructionInfo )
+{
+    startThreads( threadConstructionInfo );
+}
+
+// cleanup/shutdown Libspe2
+btThreadSupportPosix::~btThreadSupportPosix()
+{
+    stopThreads();
+}
+
+#if (defined (__APPLE__))
+#define NAMED_SEMAPHORES
+#endif
+
+
+static sem_t* createSem( const char* baseName )
+{
+    static int semCount = 0;
+#ifdef NAMED_SEMAPHORES
+    /// Named semaphore begin
+    char name[ 32 ];
+    snprintf( name, 32, "/%8.s-%4.d-%4.4d", baseName, getpid(), semCount++ );
+    sem_t* tempSem = sem_open( name, O_CREAT, 0600, 0 );
+
+    if ( tempSem != reinterpret_cast<sem_t *>( SEM_FAILED ) )
+    {
+        //        printf("Created \"%s\" Semaphore %p\n", name, tempSem);
+    }
+    else
+    {
+        //printf("Error creating Semaphore %d\n", errno);
+        exit( -1 );
+    }
+    /// Named semaphore end
+#else
+    sem_t* tempSem = new sem_t;
+    checkPThreadFunction( sem_init( tempSem, 0, 0 ) );
+#endif
+    return tempSem;
+}
+
+static void destroySem( sem_t* semaphore )
+{
+#ifdef NAMED_SEMAPHORES
+    checkPThreadFunction( sem_close( semaphore ) );
+#else
+    checkPThreadFunction( sem_destroy( semaphore ) );
+    delete semaphore;
+#endif
+}
+
+static void *threadFunction( void *argument )
+{
+    btThreadSupportPosix::btThreadStatus* status = ( btThreadSupportPosix::btThreadStatus* )argument;
+
+    while ( 1 )
+    {
+        checkPThreadFunction( sem_wait( status->startSemaphore ) );
+        void* userPtr = status->m_userPtr;
+
+        if ( userPtr )
+        {
+            btAssert( status->m_status );
+            status->m_userThreadFunc( userPtr );
+            status->m_status = 2;
+            checkPThreadFunction( sem_post( status->m_mainSemaphore ) );
+            status->threadUsed++;
+        }
+        else
+        {
+            //exit Thread
+            status->m_status = 3;
+            checkPThreadFunction( sem_post( status->m_mainSemaphore ) );
+            printf( "Thread with taskId %i exiting\n", status->m_taskId );
+            break;
+        }
+    }
+
+    printf( "Thread TERMINATED\n" );
+}
+
+///send messages to SPUs
+void btThreadSupportPosix::runTask( int threadIndex, void* userData )
+{
+    ///we should spawn an SPU task here, and in 'waitForResponse' it should wait for response of the (one of) the first tasks that finished
+    btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ];
+    btAssert( threadIndex >= 0 );
+    btAssert( threadIndex < m_activeThreadStatus.size() );
+
+    threadStatus.m_commandId = 1;
+    threadStatus.m_status = 1;
+    threadStatus.m_userPtr = userData;
+    m_startedThreadsMask |= UINT64( 1 ) << threadIndex;
+
+    // fire event to start new task
+    checkPThreadFunction( sem_post( threadStatus.startSemaphore ) );
+}
+
+
+///check for messages from SPUs
+int btThreadSupportPosix::waitForResponse()
+{
+    ///We should wait for (one of) the first tasks to finish (or other SPU messages), and report its response
+    ///A possible response can be 'yes, SPU handled it', or 'no, please do a PPU fallback'
+
+    btAssert( m_activeThreadStatus.size() );
+
+    // wait for any of the threads to finish
+    checkPThreadFunction( sem_wait( m_mainSemaphore ) );
+    // get at least one thread which has finished
+    size_t last = -1;
+
+    for ( size_t t = 0; t < size_t( m_activeThreadStatus.size() ); ++t )
+    {
+        if ( 2 == m_activeThreadStatus[ t ].m_status )
+        {
+            last = t;
+            break;
+        }
+    }
+
+    btThreadStatus& threadStatus = m_activeThreadStatus[ last ];
+
+    btAssert( threadStatus.m_status > 1 );
+    threadStatus.m_status = 0;
+
+    // need to find an active spu
+    btAssert( last >= 0 );
+    m_startedThreadsMask &= ~( UINT64( 1 ) << last );
+
+    return last;
+}
+
+
+void btThreadSupportPosix::waitForAllTasks()
+{
+    while ( m_startedThreadsMask )
+    {
+        waitForResponse();
+    }
+}
+
+
+void btThreadSupportPosix::startThreads( const ConstructionInfo& threadConstructionInfo )
+{
+    m_numThreads = btGetNumHardwareThreads() - 1;  // main thread exists already
+    printf( "%s creating %i threads.\n", __FUNCTION__, m_numThreads );
+    m_activeThreadStatus.resize( m_numThreads );
+    m_startedThreadsMask = 0;
+
+    m_mainSemaphore = createSem( "main" );
+    //checkPThreadFunction(sem_wait(mainSemaphore));
+
+    for ( int i = 0; i < m_numThreads; i++ )
+    {
+        printf( "starting thread %d\n", i );
+        btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
+        threadStatus.startSemaphore = createSem( "threadLocal" );
+        checkPThreadFunction( pthread_create( &threadStatus.thread, NULL, &threadFunction, (void*) &threadStatus ) );
+
+        threadStatus.m_userPtr = 0;
+        threadStatus.m_taskId = i;
+        threadStatus.m_commandId = 0;
+        threadStatus.m_status = 0;
+        threadStatus.m_mainSemaphore = m_mainSemaphore;
+        threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
+        threadStatus.threadUsed = 0;
+
+        printf( "started thread %d \n", i );
+    }
+}
+
+///tell the task scheduler we are done with the SPU tasks
+void btThreadSupportPosix::stopThreads()
+{
+    for ( size_t t = 0; t < size_t( m_activeThreadStatus.size() ); ++t )
+    {
+        btThreadStatus& threadStatus = m_activeThreadStatus[ t ];
+        printf( "%s: Thread %i used: %ld\n", __FUNCTION__, int( t ), threadStatus.threadUsed );
+
+        threadStatus.m_userPtr = 0;
+        checkPThreadFunction( sem_post( threadStatus.startSemaphore ) );
+        checkPThreadFunction( sem_wait( m_mainSemaphore ) );
+
+        printf( "destroy semaphore\n" );
+        destroySem( threadStatus.startSemaphore );
+        printf( "semaphore destroyed\n" );
+        checkPThreadFunction( pthread_join( threadStatus.thread, 0 ) );
+
+    }
+    printf( "destroy main semaphore\n" );
+    destroySem( m_mainSemaphore );
+    printf( "main semaphore destroyed\n" );
+    m_activeThreadStatus.clear();
+}
+
+class btCriticalSectionPosix : public btCriticalSection
+{
+    pthread_mutex_t m_mutex;
+
+public:
+    btCriticalSectionPosix()
+    {
+        pthread_mutex_init( &m_mutex, NULL );
+    }
+    virtual ~btCriticalSectionPosix()
+    {
+        pthread_mutex_destroy( &m_mutex );
+    }
+
+    virtual void lock()
+    {
+        pthread_mutex_lock( &m_mutex );
+    }
+    virtual void unlock()
+    {
+        pthread_mutex_unlock( &m_mutex );
+    }
+};
+
+
+btCriticalSection* btThreadSupportPosix::createCriticalSection()
+{
+    return new btCriticalSectionPosix();
+}
+
+void btThreadSupportPosix::deleteCriticalSection( btCriticalSection* cs )
+{
+    delete cs;
+}
+
+
+btThreadSupportInterface* btThreadSupportInterface::create( const ConstructionInfo& info )
+{
+    return new btThreadSupportPosix( info );
+}
+
+#endif // BT_THREADSAFE && !defined( _WIN32 )
+
--- a/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
+++ b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
@@ -0,0 +1,472 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2018 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#if defined( _WIN32 ) &&  BT_THREADSAFE
+
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btMinMax.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btThreads.h"
+#include "btThreadSupportInterface.h"
+#include <windows.h>
+#include <stdio.h>
+
+
+struct btProcessorInfo
+{
+    int numLogicalProcessors;
+    int numCores;
+    int numNumaNodes;
+    int numL1Cache;
+    int numL2Cache;
+    int numL3Cache;
+    int numPhysicalPackages;
+    static const int maxNumTeamMasks = 32;
+    int numTeamMasks;
+    UINT64 processorTeamMasks[ maxNumTeamMasks ];
+};
+
+UINT64 getProcessorTeamMask( const btProcessorInfo& procInfo, int procId )
+{
+    UINT64 procMask = UINT64( 1 ) << procId;
+    for ( int i = 0; i < procInfo.numTeamMasks; ++i )
+    {
+        if ( procMask & procInfo.processorTeamMasks[ i ] )
+        {
+            return procInfo.processorTeamMasks[ i ];
+        }
+    }
+    return 0;
+}
+
+int getProcessorTeamIndex( const btProcessorInfo& procInfo, int procId )
+{
+    UINT64 procMask = UINT64( 1 ) << procId;
+    for ( int i = 0; i < procInfo.numTeamMasks; ++i )
+    {
+        if ( procMask & procInfo.processorTeamMasks[ i ] )
+        {
+            return i;
+        }
+    }
+    return -1;
+}
+
+int countSetBits( ULONG64 bits )
+{
+    int count = 0;
+    while ( bits )
+    {
+        if ( bits & 1 )
+        {
+            count++;
+        }
+        bits >>= 1;
+    }
+    return count;
+}
+
+
+typedef BOOL( WINAPI *Pfn_GetLogicalProcessorInformation )( PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD );
+
+
+void getProcessorInformation( btProcessorInfo* procInfo )
+{
+    memset( procInfo, 0, sizeof( *procInfo ) );
+    Pfn_GetLogicalProcessorInformation getLogicalProcInfo =
+        (Pfn_GetLogicalProcessorInformation) GetProcAddress( GetModuleHandle( TEXT( "kernel32" ) ), "GetLogicalProcessorInformation" );
+    if ( getLogicalProcInfo == NULL )
+    {
+        // no info
+        return;
+    }
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buf = NULL;
+    DWORD bufSize = 0;
+    while ( true )
+    {
+        if ( getLogicalProcInfo( buf, &bufSize ) )
+        {
+            break;
+        }
+        else
+        {
+            if ( GetLastError() == ERROR_INSUFFICIENT_BUFFER )
+            {
+                if ( buf )
+                {
+                    free( buf );
+                }
+                buf = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc( bufSize );
+            }
+        }
+    }
+
+    int len = bufSize / sizeof( *buf );
+    for ( int i = 0; i < len; ++i )
+    {
+        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION info = buf + i;
+        switch ( info->Relationship )
+        {
+        case RelationNumaNode:
+            procInfo->numNumaNodes++;
+            break;
+
+        case RelationProcessorCore:
+            procInfo->numCores++;
+            procInfo->numLogicalProcessors += countSetBits( info->ProcessorMask );
+            break;
+
+        case RelationCache:
+            if ( info->Cache.Level == 1 )
+            {
+                procInfo->numL1Cache++;
+            }
+            else if ( info->Cache.Level == 2 )
+            {
+                procInfo->numL2Cache++;
+            }
+            else if ( info->Cache.Level == 3 )
+            {
+                procInfo->numL3Cache++;
+                // processors that share L3 cache are considered to be on the same team
+                // because they can more easily work together on the same data.
+                // Large performance penalties will occur if 2 or more threads from different
+                // teams attempt to frequently read and modify the same cache lines.
+                //
+                // On the AMD Ryzen 7 CPU for example, the 8 cores on the CPU are split into
+                // 2 CCX units of 4 cores each. Each CCX has a separate L3 cache, so if both
+                // CCXs are operating on the same data, many cycles will be spent keeping the
+                // two caches coherent.
+                if ( procInfo->numTeamMasks < btProcessorInfo::maxNumTeamMasks )
+                {
+                    procInfo->processorTeamMasks[ procInfo->numTeamMasks ] = info->ProcessorMask;
+                    procInfo->numTeamMasks++;
+                }
+            }
+            break;
+
+        case RelationProcessorPackage:
+            procInfo->numPhysicalPackages++;
+            break;
+        }
+    }
+    free( buf );
+}
+
+
+
+///btThreadSupportWin32 helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
+class btThreadSupportWin32 : public btThreadSupportInterface
+{
+public:
+    struct btThreadStatus
+    {
+        int m_taskId;
+        int m_commandId;
+        int m_status;
+
+        ThreadFunc m_userThreadFunc;
+        void* m_userPtr; //for taskDesc etc
+
+        void* m_threadHandle; //this one is calling 'Win32ThreadFunc'
+
+        void* m_eventStartHandle;
+        char m_eventStartHandleName[ 32 ];
+
+        void* m_eventCompleteHandle;
+        char m_eventCompleteHandleName[ 32 ];
+    };
+
+private:
+    btAlignedObjectArray<btThreadStatus> m_activeThreadStatus;
+    btAlignedObjectArray<void*> m_completeHandles;
+    int m_numThreads;
+    DWORD_PTR m_startedThreadMask;
+    btProcessorInfo m_processorInfo;
+
+    void startThreads( const ConstructionInfo& threadInfo );
+    void stopThreads();
+    int waitForResponse();
+
+public:
+
+    btThreadSupportWin32( const ConstructionInfo& threadConstructionInfo );
+    virtual ~btThreadSupportWin32();
+
+    virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
+    virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); }
+    virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return m_processorInfo.numLogicalProcessors / m_processorInfo.numCores; }
+
+    virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE;
+    virtual void waitForAllTasks() BT_OVERRIDE;
+
+    virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
+    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE;
+};
+
+
+btThreadSupportWin32::btThreadSupportWin32( const ConstructionInfo & threadConstructionInfo )
+{
+    startThreads( threadConstructionInfo );
+}
+
+
+btThreadSupportWin32::~btThreadSupportWin32()
+{
+    stopThreads();
+}
+
+
+DWORD WINAPI win32threadStartFunc( LPVOID lpParam )
+{
+    btThreadSupportWin32::btThreadStatus* status = ( btThreadSupportWin32::btThreadStatus* )lpParam;
+
+    while ( 1 )
+    {
+        WaitForSingleObject( status->m_eventStartHandle, INFINITE );
+        void* userPtr = status->m_userPtr;
+
+        if ( userPtr )
+        {
+            btAssert( status->m_status );
+            status->m_userThreadFunc( userPtr );
+            status->m_status = 2;
+            SetEvent( status->m_eventCompleteHandle );
+        }
+        else
+        {
+            //exit Thread
+            status->m_status = 3;
+            printf( "Thread with taskId %i with handle %p exiting\n", status->m_taskId, status->m_threadHandle );
+            SetEvent( status->m_eventCompleteHandle );
+            break;
+        }
+    }
+    printf( "Thread TERMINATED\n" );
+    return 0;
+}
+
+
+void btThreadSupportWin32::runTask( int threadIndex, void* userData )
+{
+    btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ];
+    btAssert( threadIndex >= 0 );
+    btAssert( int( threadIndex ) < m_activeThreadStatus.size() );
+
+    threadStatus.m_commandId = 1;
+    threadStatus.m_status = 1;
+    threadStatus.m_userPtr = userData;
+    m_startedThreadMask |= DWORD_PTR( 1 ) << threadIndex;
+
+    ///fire event to start new task
+    SetEvent( threadStatus.m_eventStartHandle );
+}
+
+
+int btThreadSupportWin32::waitForResponse()
+{
+    btAssert( m_activeThreadStatus.size() );
+
+    int last = -1;
+    DWORD res = WaitForMultipleObjects( m_completeHandles.size(), &m_completeHandles[ 0 ], FALSE, INFINITE );
+    btAssert( res != WAIT_FAILED );
+    last = res - WAIT_OBJECT_0;
+
+    btThreadStatus& threadStatus = m_activeThreadStatus[ last ];
+    btAssert( threadStatus.m_threadHandle );
+    btAssert( threadStatus.m_eventCompleteHandle );
+
+    //WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE);
+    btAssert( threadStatus.m_status > 1 );
+    threadStatus.m_status = 0;
+
+    ///need to find an active spu
+    btAssert( last >= 0 );
+    m_startedThreadMask &= ~( DWORD_PTR( 1 ) << last );
+
+    return last;
+}
+
+
+void btThreadSupportWin32::waitForAllTasks()
+{
+    while ( m_startedThreadMask )
+    {
+        waitForResponse();
+    }
+}
+
+
+void btThreadSupportWin32::startThreads( const ConstructionInfo& threadConstructionInfo )
+{
+    static int uniqueId = 0;
+    uniqueId++;
+    btProcessorInfo& procInfo = m_processorInfo;
+    getProcessorInformation( &procInfo );
+    DWORD_PTR dwProcessAffinityMask = 0;
+    DWORD_PTR dwSystemAffinityMask = 0;
+    if ( !GetProcessAffinityMask( GetCurrentProcess(), &dwProcessAffinityMask, &dwSystemAffinityMask ) )
+    {
+        dwProcessAffinityMask = 0;
+    }
+    ///The number of threads should be equal to the number of available cores - 1
+    m_numThreads = btMin(procInfo.numLogicalProcessors, int(BT_MAX_THREAD_COUNT)) - 1; // cap to max thread count (-1 because main thread already exists)
+
+    m_activeThreadStatus.resize( m_numThreads );
+    m_completeHandles.resize( m_numThreads );
+    m_startedThreadMask = 0;
+
+    // set main thread affinity
+    if ( DWORD_PTR mask = dwProcessAffinityMask & getProcessorTeamMask( procInfo, 0 ))
+    {
+        SetThreadAffinityMask( GetCurrentThread(), mask );
+        SetThreadIdealProcessor( GetCurrentThread(), 0 );
+    }
+
+    for ( int i = 0; i < m_numThreads; i++ )
+    {
+        printf( "starting thread %d\n", i );
+
+        btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
+
+        LPSECURITY_ATTRIBUTES lpThreadAttributes = NULL;
+        SIZE_T dwStackSize = threadConstructionInfo.m_threadStackSize;
+        LPTHREAD_START_ROUTINE lpStartAddress = &win32threadStartFunc;
+        LPVOID lpParameter = &threadStatus;
+        DWORD dwCreationFlags = 0;
+        LPDWORD lpThreadId = 0;
+
+        threadStatus.m_userPtr = 0;
+
+        sprintf( threadStatus.m_eventStartHandleName, "es%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i );
+        threadStatus.m_eventStartHandle = CreateEventA( 0, false, false, threadStatus.m_eventStartHandleName );
+
+        sprintf( threadStatus.m_eventCompleteHandleName, "ec%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i );
+        threadStatus.m_eventCompleteHandle = CreateEventA( 0, false, false, threadStatus.m_eventCompleteHandleName );
+
+        m_completeHandles[ i ] = threadStatus.m_eventCompleteHandle;
+
+        HANDLE handle = CreateThread( lpThreadAttributes, dwStackSize, lpStartAddress, lpParameter, dwCreationFlags, lpThreadId );
+        //SetThreadPriority( handle, THREAD_PRIORITY_HIGHEST );
+        // highest priority -- can cause erratic performance when numThreads > numCores
+        //                     we don't want worker threads to be higher priority than the main thread or the main thread could get
+        //                     totally shut out and unable to tell the workers to stop
+        //SetThreadPriority( handle, THREAD_PRIORITY_BELOW_NORMAL );
+
+        {
+            int processorId = i + 1;  // leave processor 0 for main thread
+            DWORD_PTR teamMask = getProcessorTeamMask( procInfo, processorId );
+            if ( teamMask )
+            {
+                // bind each thread to only execute on processors of it's assigned team
+                //  - for single-socket Intel x86 CPUs this has no effect (only a single, shared L3 cache so there is only 1 team)
+                //  - for multi-socket Intel this will keep threads from migrating from one socket to another
+                //  - for AMD Ryzen this will keep threads from migrating from one CCX to another
+                DWORD_PTR mask = teamMask & dwProcessAffinityMask;
+                if ( mask )
+                {
+                    SetThreadAffinityMask( handle, mask );
+                }
+            }
+            SetThreadIdealProcessor( handle, processorId );
+        }
+
+        threadStatus.m_taskId = i;
+        threadStatus.m_commandId = 0;
+        threadStatus.m_status = 0;
+        threadStatus.m_threadHandle = handle;
+        threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
+
+        printf( "started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle );
+    }
+}
+
+///tell the task scheduler we are done with the SPU tasks
+void btThreadSupportWin32::stopThreads()
+{
+    for ( int i = 0; i < m_activeThreadStatus.size(); i++ )
+    {
+        btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
+        if ( threadStatus.m_status > 0 )
+        {
+            WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE );
+        }
+
+        threadStatus.m_userPtr = NULL;
+        SetEvent( threadStatus.m_eventStartHandle );
+        WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE );
+
+        CloseHandle( threadStatus.m_eventCompleteHandle );
+        CloseHandle( threadStatus.m_eventStartHandle );
+        CloseHandle( threadStatus.m_threadHandle );
+
+    }
+
+    m_activeThreadStatus.clear();
+    m_completeHandles.clear();
+}
+
+
+class btWin32CriticalSection : public btCriticalSection
+{
+private:
+    CRITICAL_SECTION mCriticalSection;
+
+public:
+    btWin32CriticalSection()
+    {
+        InitializeCriticalSection( &mCriticalSection );
+    }
+
+    ~btWin32CriticalSection()
+    {
+        DeleteCriticalSection( &mCriticalSection );
+    }
+
+    void lock()
+    {
+        EnterCriticalSection( &mCriticalSection );
+    }
+
+    void unlock()
+    {
+        LeaveCriticalSection( &mCriticalSection );
+    }
+};
+
+
+btCriticalSection* btThreadSupportWin32::createCriticalSection()
+{
+    unsigned char* mem = (unsigned char*) btAlignedAlloc( sizeof( btWin32CriticalSection ), 16 );
+    btWin32CriticalSection* cs = new( mem ) btWin32CriticalSection();
+    return cs;
+}
+
+void btThreadSupportWin32::deleteCriticalSection( btCriticalSection* criticalSection )
+{
+    criticalSection->~btCriticalSection();
+    btAlignedFree( criticalSection );
+}
+
+
+btThreadSupportInterface* btThreadSupportInterface::create( const ConstructionInfo& info )
+{
+    return new btThreadSupportWin32( info );
+}
+
+
+
+#endif //defined(_WIN32) && BT_THREADSAFE
+
--- a/src/LinearMath/btThreads.cpp
+++ b/src/LinearMath/btThreads.cpp
@@ -453,6 +453,33 @@ void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBod
 #endif// #if BT_THREADSAFE
 }

+btScalar btParallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body )
+{
+#if BT_THREADSAFE
+
+#if BT_DETECT_BAD_THREAD_INDEX
+    if ( !btThreadsAreRunning() )
+    {
+        // clear out thread ids
+        for ( int i = 0; i < BT_MAX_THREAD_COUNT; ++i )
+        {
+            gDebugThreadIds[ i ] = kInvalidThreadId;
+        }
+    }
+#endif // #if BT_DETECT_BAD_THREAD_INDEX
+
+    btAssert( gBtTaskScheduler != NULL );  // call btSetTaskScheduler() with a valid task scheduler first!
+    return gBtTaskScheduler->parallelSum( iBegin, iEnd, grainSize, body );
+
+#else // #if BT_THREADSAFE
+
+    // non-parallel version of btParallelSum
+    btAssert( !"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE" );
+    return body.sumLoop( iBegin, iEnd );
+
+#endif //#else // #if BT_THREADSAFE
+}
+

 ///
 /// btTaskSchedulerSequential -- non-threaded implementation of task scheduler
@@ -470,6 +497,11 @@ public:
        BT_PROFILE( "parallelFor_sequential" );
        body.forLoop( iBegin, iEnd );
    }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_sequential" );
+        return body.sumLoop( iBegin, iEnd );
+    }
 };


@@ -514,11 +546,25 @@ public:
 #pragma omp parallel for schedule( static, 1 )
        for ( int i = iBegin; i < iEnd; i += grainSize )
        {
-            BT_PROFILE( "OpenMP_job" );
+            BT_PROFILE( "OpenMP_forJob" );
            body.forLoop( i, ( std::min )( i + grainSize, iEnd ) );
        }
        btPopThreadsAreRunning();
    }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelFor_OpenMP" );
+        btPushThreadsAreRunning();
+        btScalar sum = btScalar( 0 );
+#pragma omp parallel for schedule( static, 1 ) reduction(+:sum)
+        for ( int i = iBegin; i < iEnd; i += grainSize )
+        {
+            BT_PROFILE( "OpenMP_sumJob" );
+            sum += body.sumLoop( i, ( std::min )( i + grainSize, iEnd ) );
+        }
+        btPopThreadsAreRunning();
+        return sum;
+    }
 };
 #endif // #if BT_USE_OPENMP && BT_THREADSAFE

@@ -571,22 +617,21 @@ public:
            btResetThreadIndexCounter();
        }
    }
-    struct BodyAdapter
+    struct ForBodyAdapter
    {
        const btIParallelForBody* mBody;

+        ForBodyAdapter( const btIParallelForBody* body ) : mBody( body ) {}
        void operator()( const tbb::blocked_range<int>& range ) const
        {
-            BT_PROFILE( "TBB_job" );
+            BT_PROFILE( "TBB_forJob" );
            mBody->forLoop( range.begin(), range.end() );
        }
    };
    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
    {
        BT_PROFILE( "parallelFor_TBB" );
-        // TBB dispatch
-        BodyAdapter tbbBody;
-        tbbBody.mBody = &body;
+        ForBodyAdapter tbbBody( &body );
        btPushThreadsAreRunning();
        tbb::parallel_for( tbb::blocked_range<int>( iBegin, iEnd, grainSize ),
            tbbBody,
@@ -594,6 +639,29 @@ public:
        );
        btPopThreadsAreRunning();
    }
+    struct SumBodyAdapter
+    {
+        const btIParallelSumBody* mBody;
+        btScalar mSum;
+
+        SumBodyAdapter( const btIParallelSumBody* body ) : mBody( body ), mSum( btScalar( 0 ) ) {}
+        SumBodyAdapter( const SumBodyAdapter& src, tbb::split ) : mBody( src.mBody ), mSum( btScalar( 0 ) ) {}
+        void join( const SumBodyAdapter& src ) { mSum += src.mSum; }
+        void operator()( const tbb::blocked_range<int>& range )
+        {
+            BT_PROFILE( "TBB_sumJob" );
+            mSum += mBody->sumLoop( range.begin(), range.end() );
+        }
+    };
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_TBB" );
+        SumBodyAdapter tbbBody( &body );
+        btPushThreadsAreRunning();
+        tbb::parallel_deterministic_reduce( tbb::blocked_range<int>( iBegin, iEnd, grainSize ), tbbBody );
+        btPopThreadsAreRunning();
+        return tbbBody.mSum;
+    }
 };
 #endif // #if BT_USE_TBB && BT_THREADSAFE

@@ -605,6 +673,7 @@ public:
 class btTaskSchedulerPPL : public btITaskScheduler
 {
    int m_numThreads;
+    concurrency::combinable<btScalar> m_sum;  // for parallelSum
 public:
    btTaskSchedulerPPL() : btITaskScheduler( "PPL" )
    {
@@ -644,15 +713,16 @@ public:
            btResetThreadIndexCounter();
        }
    }
-    struct BodyAdapter
+    struct ForBodyAdapter
    {
        const btIParallelForBody* mBody;
        int mGrainSize;
        int mIndexEnd;

+        ForBodyAdapter( const btIParallelForBody* body, int grainSize, int end ) : mBody( body ), mGrainSize( grainSize ), mIndexEnd( end ) {}
        void operator()( int i ) const
        {
-            BT_PROFILE( "PPL_job" );
+            BT_PROFILE( "PPL_forJob" );
            mBody->forLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );
        }
    };
@@ -660,10 +730,7 @@ public:
    {
        BT_PROFILE( "parallelFor_PPL" );
        // PPL dispatch
-        BodyAdapter pplBody;
-        pplBody.mBody = &body;
-        pplBody.mGrainSize = grainSize;
-        pplBody.mIndexEnd = iEnd;
+        ForBodyAdapter pplBody( &body, grainSize, iEnd );
        btPushThreadsAreRunning();
        // note: MSVC 2010 doesn't support partitioner args, so avoid them
        concurrency::parallel_for( iBegin,
@@ -673,6 +740,36 @@ public:
        );
        btPopThreadsAreRunning();
    }
+    struct SumBodyAdapter
+    {
+        const btIParallelSumBody* mBody;
+        concurrency::combinable<btScalar>* mSum;
+        int mGrainSize;
+        int mIndexEnd;
+
+        SumBodyAdapter( const btIParallelSumBody* body, concurrency::combinable<btScalar>* sum, int grainSize, int end ) : mBody( body ), mSum(sum), mGrainSize( grainSize ), mIndexEnd( end ) {}
+        void operator()( int i ) const
+        {
+            BT_PROFILE( "PPL_sumJob" );
+            mSum->local() += mBody->sumLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );
+        }
+    };
+    static btScalar sumFunc( btScalar a, btScalar b ) { return a + b; }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_PPL" );
+        m_sum.clear();
+        SumBodyAdapter pplBody( &body, &m_sum, grainSize, iEnd );
+        btPushThreadsAreRunning();
+        // note: MSVC 2010 doesn't support partitioner args, so avoid them
+        concurrency::parallel_for( iBegin,
+            iEnd,
+            grainSize,
+            pplBody
+        );
+        btPopThreadsAreRunning();
+        return m_sum.combine( sumFunc );
+    }
 };
 #endif // #if BT_USE_PPL && BT_THREADSAFE

--- a/src/LinearMath/btThreads.h
+++ b/src/LinearMath/btThreads.h
@@ -107,6 +107,17 @@ public:
    virtual void forLoop( int iBegin, int iEnd ) const = 0;
 };

+//
+// btIParallelSumBody -- subclass this to express work that can be done in parallel
+//                       and produces a sum over all loop elements
+//
+class btIParallelSumBody
+{
+public:
+    virtual ~btIParallelSumBody() {}
+    virtual btScalar sumLoop( int iBegin, int iEnd ) const = 0;
+};
+
 //
 // btITaskScheduler -- subclass this to implement a task scheduler that can dispatch work to
 //                     worker threads
@@ -122,6 +133,8 @@ public:
    virtual int getNumThreads() const = 0;
    virtual void setNumThreads( int numThreads ) = 0;
    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) = 0;
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) = 0;
+    virtual void sleepWorkerThreadsHint() {}  // hint the task scheduler that we may not be using these threads for a little while

    // internal use only
    virtual void activate();
@@ -143,6 +156,9 @@ btITaskScheduler* btGetTaskScheduler();
 // get non-threaded task scheduler (always available)
 btITaskScheduler* btGetSequentialTaskScheduler();

+// create a default task scheduler (Win32 or pthreads based)
+btITaskScheduler* btCreateDefaultTaskScheduler();
+
 // get OpenMP task scheduler (if available, otherwise returns null)
 btITaskScheduler* btGetOpenMPTaskScheduler();

@@ -156,5 +172,9 @@ btITaskScheduler* btGetPPLTaskScheduler();
 //                 (iterations may be done out of order, so no dependencies are allowed)
 void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body );

+// btParallelSum -- call this to dispatch work like a for-loop, returns the sum of all iterations
+//                 (iterations may be done out of order, so no dependencies are allowed)
+btScalar btParallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body );
+

 #endif
--- a/src/LinearMath/premake4.lua
+++ b/src/LinearMath/premake4.lua
@@ -9,5 +9,7 @@
 	}
 	files {
 		"*.cpp",
-		"*.h"
+		"*.h",
+		"TaskScheduler/*.cpp",
+		"TaskScheduler/*.h"
 	}