From b8720f2161ad5a27693341b9ef47ec2380a4a972 Mon Sep 17 00:00:00 2001
From: Lunkhound <lunkhound@gmail.com>
Date: Sun, 4 Jun 2017 17:57:25 -0700
Subject: [PATCH 1/8] parallel solver: various changes  - threading: adding
 btSequentialImpulseConstraintSolverMt  - task scheduler: added parallelSum so
 that parallel solver can compute residuals  - CommonRigidBodyMTBase: add
 slider for solver least squares residual and allow multithreading without
 needing OpenMP, TBB, or PPL  - taskScheduler: don't wait for workers to
 sleep/signal at the end of each parallel block  - parallel solver:
 convertContacts split into an allocContactConstraints and
 setupContactConstraints stage, the latter of which is done in parallel  -
 parallel solver: rolling friction is now interleaved along with normal
 friction  - parallel solver: batchified split impulse solving + some cleanup 
 - parallel solver: sorting batches from largest to smallest  - parallel
 solver: added parallel batch creation  - parallel solver: added
 warmstartingWriteBackContacts func + other cleanup  - task scheduler:
 truncate low bits to preserve determinism with parallelSum  - parallel
 solver: reducing dynamic mem allocs and trying to parallelize more of the
 batch setup  - parallel solver: parallelize updating constraint batch ids for
 merging  - parallel solver: adding debug visualization  - task scheduler:
 make TBB task scheduler parallelSum deterministic  - parallel solver: split
 batch gen code into separate file; allow selection of batch gen method  -
 task scheduler: add sleepWorkerThreadsHint() at end of simulation  - parallel
 solver: added grain size per phase  - task Scheduler: fix for strange
 threading issue; also no need for main thread to wait for workers to sleep  -
 base constraint solver: break out joint setup into separate function for
 profiling/overriding  - parallel solver: allow different batching method for
 contacts vs joints  - base constraint solver: add convertJoint and
 convertBodies to make it possible to parallelize joint and body conversion  -
 parallel solver: convert joints and bodies in parallel now  - parallel
 solver: speed up batch creation with run-length encoding  - parallel solver:
 batch gen: run-length expansion in parallel; collect constraint info in
 parallel  - parallel solver: adding spatial grid batching method  - parallel
 solver: enhancements to spatial grid batching  - sequential solver: moving
 code for writing back into functions that derived classes can call  -
 parallel solver: do write back of bodies and joints in parallel  - parallel
 solver: removed all batching methods except for spatial grid (others were
 ineffective)  - parallel solver: added 2D or 3D grid batching options; and a
 bit of cleanup  - move btDefaultTaskScheduler into LinearMath project

---
 CMakeLists.txt                                |   13 +-
 examples/ExampleBrowser/CMakeLists.txt        |    1 -
 .../CommonRigidBodyMTBase.cpp                 |  235 ++-
 .../MultiThreadedDemo/CommonRigidBodyMTBase.h |    2 +
 .../MultiThreadedDemo/MultiThreadedDemo.cpp   |   31 +-
 examples/MultiThreading/btTaskScheduler.cpp   |  448 -----
 examples/MultiThreading/btTaskScheduler.h     |   26 -
 src/BulletDynamics/CMakeLists.txt             |    3 +
 .../ConstraintSolver/btBatchedConstraints.cpp | 1129 ++++++++++++
 .../ConstraintSolver/btBatchedConstraints.h   |   66 +
 .../btSequentialImpulseConstraintSolver.cpp   |  530 +++---
 .../btSequentialImpulseConstraintSolver.h     |    8 +-
 .../btSequentialImpulseConstraintSolverMt.cpp | 1611 +++++++++++++++++
 .../btSequentialImpulseConstraintSolverMt.h   |  154 ++
 .../Dynamics/btDiscreteDynamicsWorldMt.cpp    |   11 +
 .../Dynamics/btDiscreteDynamicsWorldMt.h      |    2 +
 .../Dynamics/btSimulationIslandManagerMt.cpp  |   43 +-
 .../Dynamics/btSimulationIslandManagerMt.h    |    2 +
 src/LinearMath/CMakeLists.txt                 |    4 +
 .../TaskScheduler/btTaskScheduler.cpp         |  619 +++++++
 .../TaskScheduler/btThreadSupportInterface.h  |   75 +
 .../TaskScheduler/btThreadSupportPosix.cpp    |  369 ++++
 .../TaskScheduler/btThreadSupportWin32.cpp    |  480 +++++
 src/LinearMath/btThreads.cpp                  |  121 +-
 src/LinearMath/btThreads.h                    |   20 +
 25 files changed, 5236 insertions(+), 767 deletions(-)
 delete mode 100644 examples/MultiThreading/btTaskScheduler.cpp
 delete mode 100644 examples/MultiThreading/btTaskScheduler.h
 create mode 100644 src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
 create mode 100644 src/BulletDynamics/ConstraintSolver/btBatchedConstraints.h
 create mode 100644 src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
 create mode 100644 src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h
 create mode 100644 src/LinearMath/TaskScheduler/btTaskScheduler.cpp
 create mode 100644 src/LinearMath/TaskScheduler/btThreadSupportInterface.h
 create mode 100644 src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
 create mode 100644 src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c14c02640..2a951d25a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,14 +28,14 @@ OPTION(USE_GRAPHICAL_BENCHMARK "Use Graphical Benchmark" ON)
 OPTION(BUILD_SHARED_LIBS "Use shared libraries" OFF)
 OPTION(USE_SOFT_BODY_MULTI_BODY_DYNAMICS_WORLD "Use btSoftMultiBodyDynamicsWorld" ON)
 
-OPTION(BULLET2_USE_THREAD_LOCKS "Build Bullet 2 libraries with mutex locking around certain operations (required for multi-threading)" OFF)
-IF (BULLET2_USE_THREAD_LOCKS)
+OPTION(BULLET2_MULTITHREADING "Build Bullet 2 libraries with mutex locking around certain operations (required for multi-threading)" OFF)
+IF (BULLET2_MULTITHREADING)
     OPTION(BULLET2_USE_OPEN_MP_MULTITHREADING "Build Bullet 2 with support for multi-threading with OpenMP (requires a compiler with OpenMP support)" OFF)
     OPTION(BULLET2_USE_TBB_MULTITHREADING "Build Bullet 2 with support for multi-threading with Intel Threading Building Blocks (requires the TBB library to be already installed)" OFF)
     IF (MSVC)
         OPTION(BULLET2_USE_PPL_MULTITHREADING "Build Bullet 2 with support for multi-threading with Microsoft Parallel Patterns Library (requires MSVC compiler)" OFF)
     ENDIF (MSVC)
-ENDIF (BULLET2_USE_THREAD_LOCKS)
+ENDIF (BULLET2_MULTITHREADING)
 
 
 IF(NOT WIN32)
@@ -225,12 +225,15 @@ IF(USE_GRAPHICAL_BENCHMARK)
 ADD_DEFINITIONS( -DUSE_GRAPHICAL_BENCHMARK)
 ENDIF (USE_GRAPHICAL_BENCHMARK)
 
-IF(BULLET2_USE_THREAD_LOCKS)
+IF(BULLET2_MULTITHREADING)
 	ADD_DEFINITIONS( -DBT_THREADSAFE=1 )
 	IF (NOT MSVC)
 		SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 	ENDIF (NOT MSVC)
-ENDIF (BULLET2_USE_THREAD_LOCKS)
+	IF (NOT WIN32)
+		LINK_LIBRARIES( pthread )
+	ENDIF (NOT WIN32)
+ENDIF (BULLET2_MULTITHREADING)
 
 IF (BULLET2_USE_OPEN_MP_MULTITHREADING)
     ADD_DEFINITIONS("-DBT_USE_OPENMP=1")
diff --git a/examples/ExampleBrowser/CMakeLists.txt b/examples/ExampleBrowser/CMakeLists.txt
index cd50c7cf2..6bccf0d5c 100644
--- a/examples/ExampleBrowser/CMakeLists.txt
+++ b/examples/ExampleBrowser/CMakeLists.txt
@@ -226,7 +226,6 @@ SET(BulletExampleBrowser_SRCS
 	../MultiThreading/b3PosixThreadSupport.cpp
 	../MultiThreading/b3Win32ThreadSupport.cpp
 	../MultiThreading/b3ThreadSupportInterface.cpp
-	../MultiThreading/btTaskScheduler.cpp
 	../RenderingExamples/TinyRendererSetup.cpp
 	../RenderingExamples/TimeSeriesCanvas.cpp
 	../RenderingExamples/TimeSeriesCanvas.h
diff --git a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
index 1cf21db1e..b11cd7691 100644
--- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
+++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
@@ -29,17 +29,17 @@ class btCollisionShape;
 #include "BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h"
 #include "BulletDynamics/Dynamics/btSimulationIslandManagerMt.h"  // for setSplitIslands()
 #include "BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h"
+#include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h"
 #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"
 #include "BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.h"
 #include "BulletDynamics/MLCPSolvers/btMLCPSolver.h"
 #include "BulletDynamics/MLCPSolvers/btSolveProjectedGaussSeidel.h"
 #include "BulletDynamics/MLCPSolvers/btDantzigSolver.h"
 #include "BulletDynamics/MLCPSolvers/btLemkeSolver.h"
-#include "../MultiThreading/btTaskScheduler.h"
 
 
 static int gNumIslands = 0;
-
+bool gAllowNestedParallelForLoops = false;
 
 class Profiler
 {
@@ -52,6 +52,10 @@ public:
         kRecordPredictUnconstrainedMotion,
         kRecordCreatePredictiveContacts,
         kRecordIntegrateTransforms,
+        kRecordSolverTotal,
+        kRecordSolverSetup,
+        kRecordSolverIterations,
+        kRecordSolverFinish,
         kRecordCount
     };
 
@@ -139,6 +143,41 @@ static void profileEndCallback( btDynamicsWorld *world, btScalar timeStep )
 }
 
 
+class MySequentialImpulseConstraintSolverMt : public btSequentialImpulseConstraintSolverMt
+{
+    typedef btSequentialImpulseConstraintSolverMt ParentClass;
+public:
+    BT_DECLARE_ALIGNED_ALLOCATOR();
+	
+	MySequentialImpulseConstraintSolverMt() {}
+
+    // for profiling
+	virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE
+    {
+        ProfileHelper prof(Profiler::kRecordSolverSetup);
+        btScalar ret = ParentClass::solveGroupCacheFriendlySetup(bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer );
+        return ret;
+    }
+    virtual btScalar solveGroupCacheFriendlyIterations( btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer ) BT_OVERRIDE
+    {
+        ProfileHelper prof(Profiler::kRecordSolverIterations);
+        btScalar ret = ParentClass::solveGroupCacheFriendlyIterations(bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer );
+        return ret;
+    }
+    virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal) BT_OVERRIDE
+    {
+        ProfileHelper prof(Profiler::kRecordSolverFinish);
+        btScalar ret = ParentClass::solveGroupCacheFriendlyFinish(bodies, numBodies, infoGlobal);
+        return ret;
+    }
+    virtual btScalar solveGroup(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifold, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btDispatcher* dispatcher) BT_OVERRIDE
+    {
+        ProfileHelper prof(Profiler::kRecordSolverTotal);
+        btScalar ret = ParentClass::solveGroup(bodies, numBodies, manifold, numManifolds, constraints, numConstraints, info, debugDrawer, dispatcher);
+        return ret;
+    }
+};
+
 ///
 /// MyCollisionDispatcher -- subclassed for profiling purposes
 ///
@@ -218,6 +257,8 @@ btConstraintSolver* createSolverByType( SolverType t )
     {
     case SOLVER_TYPE_SEQUENTIAL_IMPULSE:
         return new btSequentialImpulseConstraintSolver();
+    case SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT:
+        return new MySequentialImpulseConstraintSolverMt();
     case SOLVER_TYPE_NNCG:
         return new btNNCGConstraintSolver();
     case SOLVER_TYPE_MLCP_PGS:
@@ -253,7 +294,7 @@ public:
     {
         addTaskScheduler( btGetSequentialTaskScheduler() );
 #if BT_THREADSAFE
-        if ( btITaskScheduler* ts = createDefaultTaskScheduler() )
+        if ( btITaskScheduler* ts = btCreateDefaultTaskScheduler() )
         {
             m_allocatedTaskSchedulers.push_back( ts );
             addTaskScheduler( ts );
@@ -310,7 +351,7 @@ static bool gDisplayProfileInfo = true;
 static bool gMultithreadedWorld = false;
 static bool gDisplayProfileInfo = false;
 #endif
-static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
+static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT;
 static int gSolverMode = SOLVER_SIMD |
                         SOLVER_USE_WARMSTARTING |
                         // SOLVER_RANDMIZE_ORDER |
@@ -318,9 +359,11 @@ static int gSolverMode = SOLVER_SIMD |
                         // SOLVER_USE_2_FRICTION_DIRECTIONS |
                         0;
 static btScalar gSliderSolverIterations = 10.0f; // should be int
-
 static btScalar gSliderNumThreads = 1.0f;  // should be int
-
+static btScalar gSliderIslandBatchingThreshold = 0.0f; // should be int
+static btScalar gSliderMinBatchSize = btScalar(btSequentialImpulseConstraintSolverMt::s_minBatchSize); // should be int
+static btScalar gSliderMaxBatchSize = btScalar(btSequentialImpulseConstraintSolverMt::s_maxBatchSize); // should be int
+static btScalar gSliderLeastSquaresResidualThreshold = 0.0f;
 
 ////////////////////////////////////
 CommonRigidBodyMTBase::CommonRigidBodyMTBase( struct GUIHelperInterface* helper )
@@ -419,6 +462,23 @@ void setTaskSchedulerComboBoxCallback(int combobox, const char* item, void* user
 }
 
 
+void setBatchingMethodComboBoxCallback(int combobox, const char* item, void* userPointer)
+{
+#if BT_THREADSAFE
+    const char** items = static_cast<const char**>( userPointer );
+    for ( int i = 0; i < btBatchedConstraints::BATCHING_METHOD_COUNT; ++i )
+    {
+        if ( strcmp( item, items[ i ] ) == 0 )
+        {
+            // change the contact batching method
+            btSequentialImpulseConstraintSolverMt::s_contactBatchingMethod = static_cast<btBatchedConstraints::BatchingMethod>( i );
+            break;
+        }
+    }
+#endif // #if BT_THREADSAFE
+}
+
+
 static void setThreadCountCallback(float val, void* userPtr)
 {
 #if BT_THREADSAFE
@@ -435,13 +495,43 @@ static void setSolverIterationCountCallback(float val, void* userPtr)
     }
 }
 
+static void setLargeIslandManifoldCountCallback( float val, void* userPtr )
+{
+    btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching = int( gSliderIslandBatchingThreshold );
+}
+
+static void setMinBatchSizeCallback( float val, void* userPtr )
+{
+    gSliderMaxBatchSize = (std::max)(gSliderMinBatchSize, gSliderMaxBatchSize);
+    btSequentialImpulseConstraintSolverMt::s_minBatchSize = int(gSliderMinBatchSize);
+    btSequentialImpulseConstraintSolverMt::s_maxBatchSize = int(gSliderMaxBatchSize);
+}
+
+static void setMaxBatchSizeCallback( float val, void* userPtr )
+{
+    gSliderMinBatchSize = (std::min)(gSliderMinBatchSize, gSliderMaxBatchSize);
+    btSequentialImpulseConstraintSolverMt::s_minBatchSize = int(gSliderMinBatchSize);
+    btSequentialImpulseConstraintSolverMt::s_maxBatchSize = int(gSliderMaxBatchSize);
+}
+
+static void setLeastSquaresResidualThresholdCallback( float val, void* userPtr )
+{
+    if (btDiscreteDynamicsWorld* world = reinterpret_cast<btDiscreteDynamicsWorld*>(userPtr))
+    {
+        world->getSolverInfo().m_leastSquaresResidualThreshold = gSliderLeastSquaresResidualThreshold;
+    }
+}
+
 void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
 {
     gNumIslands = 0;
     m_solverType = gSolverType;
-#if BT_THREADSAFE && (BT_USE_OPENMP || BT_USE_PPL || BT_USE_TBB)
+#if BT_THREADSAFE
     btAssert( btGetTaskScheduler() != NULL );
-    m_multithreadCapable = true;
+    if (NULL != btGetTaskScheduler() && gTaskSchedulerMgr.getNumTaskSchedulers() > 1)
+    {
+        m_multithreadCapable = true;
+    }
 #endif
     if ( gMultithreadedWorld )
     {
@@ -486,7 +576,12 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
 
         m_broadphase = new btDbvtBroadphase();
 
-        m_solver = createSolverByType( m_solverType );
+        SolverType solverType = m_solverType;
+        if ( solverType == SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT )
+        {
+            solverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
+        }
+        m_solver = createSolverByType( solverType );
 
         m_dynamicsWorld = new btDiscreteDynamicsWorld( m_dispatcher, m_broadphase, m_solver, m_collisionConfiguration );
     }
@@ -494,6 +589,7 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
     m_dynamicsWorld->setInternalTickCallback( profileEndCallback, NULL, false );
     m_dynamicsWorld->setGravity( btVector3( 0, -10, 0 ) );
     m_dynamicsWorld->getSolverInfo().m_solverMode = gSolverMode;
+   	m_dynamicsWorld->getSolverInfo().m_numIterations = btMax(1, int(gSliderSolverIterations));
     createDefaultParameters();
 }
 
@@ -504,16 +600,18 @@ void CommonRigidBodyMTBase::createDefaultParameters()
     {
         // create a button to toggle multithreaded world
         ButtonParams button( "Multithreaded world enable", 0, true );
-        button.m_initialState = gMultithreadedWorld;
-        button.m_userPointer = &gMultithreadedWorld;
+        bool* ptr = &gMultithreadedWorld;
+        button.m_initialState = *ptr;
+        button.m_userPointer = ptr;
         button.m_callback = boolPtrButtonCallback;
         m_guiHelper->getParameterInterface()->registerButtonParameter( button );
     }
     {
         // create a button to toggle profile printing
         ButtonParams button( "Display solver info", 0, true );
-        button.m_initialState = gDisplayProfileInfo;
-        button.m_userPointer = &gDisplayProfileInfo;
+        bool* ptr = &gDisplayProfileInfo;
+        button.m_initialState = *ptr;
+        button.m_userPointer = ptr;
         button.m_callback = boolPtrButtonCallback;
         m_guiHelper->getParameterInterface()->registerButtonParameter( button );
     }
@@ -544,6 +642,16 @@ void CommonRigidBodyMTBase::createDefaultParameters()
         slider.m_clampToIntegers = true;
         m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
     }
+    {
+        // a slider for the solver leastSquaresResidualThreshold (used to run fewer solver iterations when convergence is good)
+        SliderParams slider( "Solver residual thresh", &gSliderLeastSquaresResidualThreshold );
+        slider.m_minVal = 0.0f;
+        slider.m_maxVal = 0.25f;
+        slider.m_callback = setLeastSquaresResidualThresholdCallback;
+        slider.m_userPointer = m_dynamicsWorld;
+        slider.m_clampToIntegers = false;
+        m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
+    }
     {
         ButtonParams button( "Solver use SIMD", 0, true );
         button.m_buttonId = SOLVER_SIMD;
@@ -618,20 +726,86 @@ void CommonRigidBodyMTBase::createDefaultParameters()
             m_guiHelper->getParameterInterface()->registerComboBox( comboParams );
         }
         {
-            // create a slider to set the number of threads to use
-            int numThreads = btGetTaskScheduler()->getNumThreads();
             // if slider has not been set yet (by another demo),
             if ( gSliderNumThreads <= 1.0f )
             {
+                // initialize the slider value from the task scheduler's current thread count
+                int numThreads = btGetTaskScheduler()->getNumThreads();
                 gSliderNumThreads = float( numThreads );
             }
+            int maxNumThreads = btGetTaskScheduler()->getMaxNumThreads();
 			SliderParams slider("Thread count", &gSliderNumThreads);
 			slider.m_minVal = 1.0f;
-			slider.m_maxVal = float( BT_MAX_THREAD_COUNT );
+			slider.m_maxVal = float( maxNumThreads );
 			slider.m_callback = setThreadCountCallback;
             slider.m_clampToIntegers = true;
             m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
         }
+        {
+            // a slider for the minimum number of contact manifolds an island must have to be considered too large for parallel dispatch
+            if (gSliderIslandBatchingThreshold < 1.0)
+            {
+                gSliderIslandBatchingThreshold = float(btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching);
+            }
+            SliderParams slider( "IslandBatchThresh", &gSliderIslandBatchingThreshold );
+            slider.m_minVal = 1.0f;
+            slider.m_maxVal = 2000.0f;
+            slider.m_callback = setLargeIslandManifoldCountCallback;
+            slider.m_userPointer = NULL;
+            slider.m_clampToIntegers = true;
+            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
+        }
+        {
+            // create a combo box for selecting the batching method
+            static const char* sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_COUNT ];
+            {
+                sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_2D ] = "Batching: 2D Grid";
+                sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_3D ] = "Batching: 3D Grid";
+            };
+            ComboBoxParams comboParams;
+            comboParams.m_userPointer = sBatchingMethodComboBoxItems;
+            comboParams.m_numItems = btBatchedConstraints::BATCHING_METHOD_COUNT;
+            comboParams.m_startItem = static_cast<int>(btSequentialImpulseConstraintSolverMt::s_contactBatchingMethod);
+            comboParams.m_items = sBatchingMethodComboBoxItems;
+            comboParams.m_callback = setBatchingMethodComboBoxCallback;
+            m_guiHelper->getParameterInterface()->registerComboBox( comboParams );
+        }
+        {
+            // a slider for the sequentialImpulseConstraintSolverMt min batch size (when batching)
+            SliderParams slider( "Min batch size", &gSliderMinBatchSize );
+            slider.m_minVal = 1.0f;
+            slider.m_maxVal = 1000.0f;
+            slider.m_callback = setMinBatchSizeCallback;
+            slider.m_userPointer = NULL;
+            slider.m_clampToIntegers = true;
+            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
+        }
+        {
+            // a slider for the sequentialImpulseConstraintSolverMt max batch size (when batching)
+            SliderParams slider( "Max batch size", &gSliderMaxBatchSize );
+            slider.m_minVal = 1.0f;
+            slider.m_maxVal = 1000.0f;
+            slider.m_callback = setMaxBatchSizeCallback;
+            slider.m_userPointer = NULL;
+            slider.m_clampToIntegers = true;
+            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
+        }
+        {
+            // create a button to toggle debug drawing of batching visualization
+            ButtonParams button( "Visualize batching", 0, true );
+            bool* ptr = &btBatchedConstraints::s_debugDrawBatches;
+            button.m_initialState = *ptr;
+            button.m_userPointer = ptr;
+            button.m_callback = boolPtrButtonCallback;
+            m_guiHelper->getParameterInterface()->registerButtonParameter( button );
+        }
+        {
+            ButtonParams button( "Allow Nested ParallelFor", 0, true );
+            button.m_initialState = btSequentialImpulseConstraintSolverMt::s_allowNestedParallelForLoops;
+            button.m_userPointer = &btSequentialImpulseConstraintSolverMt::s_allowNestedParallelForLoops;
+            button.m_callback = boolPtrButtonCallback;
+            m_guiHelper->getParameterInterface()->registerButtonParameter( button );
+        }
 #endif // #if BT_THREADSAFE
     }
 }
@@ -643,6 +817,7 @@ void CommonRigidBodyMTBase::drawScreenText()
     int xCoord = 400;
     int yCoord = 30;
     int yStep = 30;
+    int indent = 30;
     if (m_solverType != gSolverType)
     {
         sprintf( msg, "restart example to change solver type" );
@@ -721,6 +896,34 @@ void CommonRigidBodyMTBase::drawScreenText()
             m_guiHelper->getAppInterface()->drawText( msg, xCoord, yCoord, 0.4f );
             yCoord += yStep;
 
+            sprintf( msg,
+                     "SolverTotal %5.3f ms",
+                     gProfiler.getAverageTime( Profiler::kRecordSolverTotal )*0.001f
+                     );
+            m_guiHelper->getAppInterface()->drawText( msg, xCoord, yCoord, 0.4f );
+            yCoord += yStep;
+
+            sprintf( msg,
+                     "SolverSetup %5.3f ms",
+                     gProfiler.getAverageTime( Profiler::kRecordSolverSetup )*0.001f
+                     );
+            m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f );
+            yCoord += yStep;
+
+            sprintf( msg,
+                     "SolverIterations %5.3f ms",
+                     gProfiler.getAverageTime( Profiler::kRecordSolverIterations )*0.001f
+                     );
+            m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f );
+            yCoord += yStep;
+
+            sprintf( msg,
+                     "SolverFinish %5.3f ms",
+                     gProfiler.getAverageTime( Profiler::kRecordSolverFinish )*0.001f
+                     );
+            m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f );
+            yCoord += yStep;
+
             sprintf( msg,
                      "PredictUnconstrainedMotion %5.3f ms",
                      gProfiler.getAverageTime( Profiler::kRecordPredictUnconstrainedMotion )*0.001f
diff --git a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h
index 0695b88c0..c283a3f22 100644
--- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h
+++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h
@@ -14,6 +14,7 @@
 enum SolverType
 {
     SOLVER_TYPE_SEQUENTIAL_IMPULSE,
+    SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT,
     SOLVER_TYPE_NNCG,
     SOLVER_TYPE_MLCP_PGS,
     SOLVER_TYPE_MLCP_DANTZIG,
@@ -27,6 +28,7 @@ inline const char* getSolverTypeName( SolverType t )
     switch (t)
     {
     case SOLVER_TYPE_SEQUENTIAL_IMPULSE: return "SequentialImpulse";
+    case SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT: return "SequentialImpulseMt";
     case SOLVER_TYPE_NNCG: return "NNCG";
     case SOLVER_TYPE_MLCP_PGS: return "MLCP ProjectedGaussSeidel";
     case SOLVER_TYPE_MLCP_DANTZIG: return "MLCP Dantzig";
diff --git a/examples/MultiThreadedDemo/MultiThreadedDemo.cpp b/examples/MultiThreadedDemo/MultiThreadedDemo.cpp
index a04ab0d91..3cfcec807 100644
--- a/examples/MultiThreadedDemo/MultiThreadedDemo.cpp
+++ b/examples/MultiThreadedDemo/MultiThreadedDemo.cpp
@@ -25,10 +25,10 @@ subject to the following restrictions:
 
 
 
-static btScalar gSliderStackRows = 8.0f;
-static btScalar gSliderStackColumns = 6.0f;
-static btScalar gSliderStackHeight = 10.0f;
-static btScalar gSliderStackWidth = 1.0f;
+static btScalar gSliderStackRows = 1.0f;
+static btScalar gSliderStackColumns = 1.0f;
+static btScalar gSliderStackHeight = 15.0f;
+static btScalar gSliderStackWidth = 8.0f;
 static btScalar gSliderGroundHorizontalAmplitude = 0.0f;
 static btScalar gSliderGroundVerticalAmplitude = 0.0f;
 static btScalar gSliderGroundTilt = 0.0f;
@@ -75,6 +75,21 @@ public:
         btScalar tilt = gSliderGroundTilt * SIMD_2_PI / 360.0f;
         return btQuaternion( btVector3( 1.0f, 0.0f, 0.0f ), tilt );
     }
+    struct TestSumBody : public btIParallelSumBody
+    {
+        virtual btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+        {
+            btScalar sum = 0.0f;
+            for (int i = iBegin; i < iEnd; ++i)
+            {
+                if (i > 0)
+                {
+                    sum += 1.0f / btScalar(i);
+                }
+            }
+            return sum;
+        }
+    };
     virtual void stepSimulation( float deltaTime ) BT_OVERRIDE
     {
         if ( m_dynamicsWorld )
@@ -115,6 +130,14 @@ public:
             // always step by 1/60 for benchmarking
             m_dynamicsWorld->stepSimulation( 1.0f / 60.0f, 0 );
         }
+#if 0
+        {
+            // test parallelSum
+            TestSumBody testSumBody;
+            float testSum = btParallelSum( 1, 10000000, 10000, testSumBody );
+            printf( "sum = %f\n", testSum );
+        }
+#endif
     }
 
     virtual void initPhysics() BT_OVERRIDE;
diff --git a/examples/MultiThreading/btTaskScheduler.cpp b/examples/MultiThreading/btTaskScheduler.cpp
deleted file mode 100644
index e6862a197..000000000
--- a/examples/MultiThreading/btTaskScheduler.cpp
+++ /dev/null
@@ -1,448 +0,0 @@
-
-#include "LinearMath/btTransform.h"
-#include "../Utils/b3Clock.h"
-#include "LinearMath/btAlignedObjectArray.h"
-#include "LinearMath/btThreads.h"
-#include "LinearMath/btQuickprof.h"
-#include <stdio.h>
-#include <algorithm>
-
-
-typedef void( *btThreadFunc )( void* userPtr, void* lsMemory );
-typedef void* ( *btThreadLocalStorageFunc )();
-
-#if BT_THREADSAFE
-
-#if defined( _WIN32 )
-
-#include "b3Win32ThreadSupport.h"
-
-b3ThreadSupportInterface* createThreadSupport( int numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName )
-{
-    b3Win32ThreadSupport::Win32ThreadConstructionInfo constructionInfo( uniqueName, threadFunc, localStoreFunc, numThreads );
-    //constructionInfo.m_priority = 0;  // highest priority (the default) -- can cause erratic performance when numThreads > numCores
-    //                                     we don't want worker threads to be higher priority than the main thread or the main thread could get
-    //                                     totally shut out and unable to tell the workers to stop
-    constructionInfo.m_priority = -1;  // normal priority
-    b3Win32ThreadSupport* threadSupport = new b3Win32ThreadSupport( constructionInfo );
-    return threadSupport;
-}
-
-#else // #if defined( _WIN32 )
-
-#include "b3PosixThreadSupport.h"
-
-b3ThreadSupportInterface* createThreadSupport( int numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName)
-{
-    b3PosixThreadSupport::ThreadConstructionInfo constructionInfo( uniqueName, threadFunc, localStoreFunc, numThreads );
-    b3ThreadSupportInterface* threadSupport = new b3PosixThreadSupport( constructionInfo );
-    return threadSupport;
-}
-
-#endif // #else // #if defined( _WIN32 )
-
-
-///
-/// getNumHardwareThreads()
-///
-///
-/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
-///
-#if __cplusplus >= 201103L
-
-#include <thread>
-
-int getNumHardwareThreads()
-{
-    return std::thread::hardware_concurrency();
-}
-
-#elif defined( _WIN32 )
-
-#define WIN32_LEAN_AND_MEAN
-
-#include <windows.h>
-
-int getNumHardwareThreads()
-{
-    // caps out at 32
-    SYSTEM_INFO info;
-    GetSystemInfo( &info );
-    return info.dwNumberOfProcessors;
-}
-
-#else
-
-int getNumHardwareThreads()
-{
-    return 0;  // don't know
-}
-
-#endif
-
-
-struct WorkerThreadStatus
-{
-    enum Type
-    {
-        kInvalid,
-        kWaitingForWork,
-        kWorking,
-        kSleeping,
-    };
-};
-
-
-struct IJob
-{
-    virtual void executeJob() = 0;
-};
-
-class ParallelForJob : public IJob
-{
-    const btIParallelForBody* mBody;
-    int mBegin;
-    int mEnd;
-
-public:
-    ParallelForJob()
-    {
-        mBody = NULL;
-        mBegin = 0;
-        mEnd = 0;
-    }
-    void init( int iBegin, int iEnd, const btIParallelForBody& body )
-    {
-        mBody = &body;
-        mBegin = iBegin;
-        mEnd = iEnd;
-    }
-    virtual void executeJob() BT_OVERRIDE
-    {
-        BT_PROFILE( "executeJob" );
-
-        // call the functor body to do the work
-        mBody->forLoop( mBegin, mEnd );
-    }
-};
-
-
-struct JobContext
-{
-    JobContext()
-    {
-        m_queueLock = NULL;
-        m_headIndex = 0;
-        m_tailIndex = 0;
-        m_workersShouldCheckQueue = false;
-        m_useSpinMutex = false;
-    }
-    b3CriticalSection* m_queueLock;
-    btSpinMutex m_mutex;
-    volatile bool m_workersShouldCheckQueue;
-
-    btAlignedObjectArray<IJob*> m_jobQueue;
-    bool m_queueIsEmpty;
-    int m_tailIndex;
-    int m_headIndex;
-    bool m_useSpinMutex;
-
-    void lockQueue()
-    {
-        if ( m_useSpinMutex )
-        {
-            m_mutex.lock();
-        }
-        else
-        {
-            m_queueLock->lock();
-        }
-    }
-    void unlockQueue()
-    {
-        if ( m_useSpinMutex )
-        {
-            m_mutex.unlock();
-        }
-        else
-        {
-            m_queueLock->unlock();
-        }
-    }
-    void clearQueue()
-    {
-        lockQueue();
-        m_headIndex = 0;
-        m_tailIndex = 0;
-        m_queueIsEmpty = true;
-        unlockQueue();
-        m_jobQueue.resizeNoInitialize( 0 );
-    }
-    void submitJob( IJob* job )
-    {
-        m_jobQueue.push_back( job );
-        lockQueue();
-        m_tailIndex++;
-        m_queueIsEmpty = false;
-        unlockQueue();
-    }
-    IJob* consumeJob()
-    {
-        if ( m_queueIsEmpty )
-        {
-            // lock free path. even if this is taken erroneously it isn't harmful
-            return NULL;
-        }
-        IJob* job = NULL;
-        lockQueue();
-        if ( !m_queueIsEmpty )
-        {
-            job = m_jobQueue[ m_headIndex++ ];
-            if ( m_headIndex == m_tailIndex )
-            {
-                m_queueIsEmpty = true;
-            }
-        }
-        unlockQueue();
-        return job;
-    }
-};
-
-
-struct WorkerThreadLocalStorage
-{
-    int threadId;
-    WorkerThreadStatus::Type status;
-};
-
-
-static void WorkerThreadFunc( void* userPtr, void* lsMemory )
-{
-    BT_PROFILE( "WorkerThreadFunc" );
-    WorkerThreadLocalStorage* localStorage = (WorkerThreadLocalStorage*) lsMemory;
-    localStorage->status = WorkerThreadStatus::kWaitingForWork;
-    //printf( "WorkerThreadFunc: worker %d start working\n", localStorage->threadId );
-
-    JobContext* jobContext = (JobContext*) userPtr;
-
-    while ( jobContext->m_workersShouldCheckQueue )
-    {
-        if ( IJob* job = jobContext->consumeJob() )
-        {
-            localStorage->status = WorkerThreadStatus::kWorking;
-            job->executeJob();
-            localStorage->status = WorkerThreadStatus::kWaitingForWork;
-        }
-        else
-        {
-            // todo: spin wait a bit to avoid hammering the empty queue
-        }
-    }
-
-    //printf( "WorkerThreadFunc stop working\n" );
-    localStorage->status = WorkerThreadStatus::kSleeping;
-    // go idle
-}
-
-
-static void* WorkerThreadAllocFunc()
-{
-    return new WorkerThreadLocalStorage;
-}
-
-
-
-class btTaskSchedulerDefault : public btITaskScheduler
-{
-    JobContext m_jobContext;
-    b3ThreadSupportInterface* m_threadSupport;
-    btAlignedObjectArray<ParallelForJob> m_jobs;
-    btSpinMutex m_antiNestingLock;  // prevent nested parallel-for
-    int m_numThreads;
-    int m_numWorkerThreads;
-    int m_numWorkersRunning;
-public:
-
-    btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
-    {
-        m_threadSupport = NULL;
-        m_numThreads = getNumHardwareThreads();
-        // if can't detect number of cores,
-        if ( m_numThreads == 0 )
-        {
-            // take a guess
-            m_numThreads = 4;
-        }
-        m_numWorkerThreads = m_numThreads - 1;
-        m_numWorkersRunning = 0;
-    }
-
-    virtual ~btTaskSchedulerDefault()
-    {
-        shutdown();
-    }
-
-    void init()
-    {
-        int maxNumWorkerThreads = BT_MAX_THREAD_COUNT - 1;
-        m_threadSupport = createThreadSupport( maxNumWorkerThreads, WorkerThreadFunc, WorkerThreadAllocFunc, "TaskScheduler" );
-        m_jobContext.m_queueLock = m_threadSupport->createCriticalSection();
-        for ( int i = 0; i < maxNumWorkerThreads; i++ )
-        {
-            WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
-            btAssert( storage );
-            storage->threadId = i;
-            storage->status = WorkerThreadStatus::kSleeping;
-        }
-        setWorkersActive( false ); // no work for them yet
-    }
-
-    virtual void shutdown()
-    {
-        setWorkersActive( false );
-        waitForWorkersToSleep();
-        m_threadSupport->deleteCriticalSection( m_jobContext.m_queueLock );
-        m_jobContext.m_queueLock = NULL;
-
-        delete m_threadSupport;
-        m_threadSupport = NULL;
-    }
-
-    void setWorkersActive( bool active )
-    {
-        m_jobContext.m_workersShouldCheckQueue = active;
-    }
-
-    virtual int getMaxNumThreads() const BT_OVERRIDE
-    {
-        return BT_MAX_THREAD_COUNT;
-    }
-
-    virtual int getNumThreads() const BT_OVERRIDE
-    {
-        return m_numThreads;
-    }
-
-    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
-    {
-        m_numThreads = btMax( btMin(numThreads, int(BT_MAX_THREAD_COUNT)), 1 );
-        m_numWorkerThreads = m_numThreads - 1;
-    }
-
-    void waitJobs()
-    {
-        BT_PROFILE( "waitJobs" );
-        // have the main thread work until the job queue is empty
-        for ( ;; )
-        {
-            if ( IJob* job = m_jobContext.consumeJob() )
-            {
-                job->executeJob();
-            }
-            else
-            {
-                break;
-            }
-        }
-        // done with jobs for now, tell workers to rest
-        setWorkersActive( false );
-        waitForWorkersToSleep();
-    }
-
-    void wakeWorkers()
-    {
-        BT_PROFILE( "wakeWorkers" );
-        btAssert( m_jobContext.m_workersShouldCheckQueue );
-        // tell each worker thread to start working
-        for ( int i = 0; i < m_numWorkerThreads; i++ )
-        {
-            m_threadSupport->runTask( B3_THREAD_SCHEDULE_TASK, &m_jobContext, i );
-            m_numWorkersRunning++;
-        }
-    }
-
-    void waitForWorkersToSleep()
-    {
-        BT_PROFILE( "waitForWorkersToSleep" );
-        while ( m_numWorkersRunning > 0 )
-        {
-            int iThread;
-            int threadStatus;
-            m_threadSupport->waitForResponse( &iThread, &threadStatus );  // wait for worker threads to finish working
-            m_numWorkersRunning--;
-        }
-        //m_threadSupport->waitForAllTasksToComplete();
-        for ( int i = 0; i < m_numWorkerThreads; i++ )
-        {
-            //m_threadSupport->waitForTaskCompleted( i );
-            WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
-            btAssert( storage );
-            btAssert( storage->status == WorkerThreadStatus::kSleeping );
-        }
-    }
-
-    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
-    {
-        BT_PROFILE( "parallelFor_ThreadSupport" );
-        btAssert( iEnd >= iBegin );
-        btAssert( grainSize >= 1 );
-        int iterationCount = iEnd - iBegin;
-        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
-        {
-            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;
-            btAssert( jobCount >= 2 );  // need more than one job for multithreading
-            if ( jobCount > m_jobs.size() )
-            {
-                m_jobs.resize( jobCount );
-            }
-            if ( jobCount > m_jobContext.m_jobQueue.capacity() )
-            {
-                m_jobContext.m_jobQueue.reserve( jobCount );
-            }
-
-            m_jobContext.clearQueue();
-            // prepare worker threads for incoming work
-            setWorkersActive( true );
-            wakeWorkers();
-            // submit all of the jobs
-            int iJob = 0;
-            for ( int i = iBegin; i < iEnd; i += grainSize )
-            {
-                btAssert( iJob < jobCount );
-                int iE = btMin( i + grainSize, iEnd );
-                ParallelForJob& job = m_jobs[ iJob ];
-                job.init( i, iE, body );
-                m_jobContext.submitJob( &job );
-                iJob++;
-            }
-
-            // put the main thread to work on emptying the job queue and then wait for all workers to finish
-            waitJobs();
-            m_antiNestingLock.unlock();
-        }
-        else
-        {
-            BT_PROFILE( "parallelFor_mainThread" );
-            // just run on main thread
-            body.forLoop( iBegin, iEnd );
-        }
-    }
-};
-
-
-
-btITaskScheduler* createDefaultTaskScheduler()
-{
-    btTaskSchedulerDefault* ts = new btTaskSchedulerDefault();
-    ts->init();
-    return ts;
-}
-
-#else // #if BT_THREADSAFE
-
-btITaskScheduler* createDefaultTaskScheduler()
-{
-    return NULL;
-}
-
-#endif // #else // #if BT_THREADSAFE
\ No newline at end of file
diff --git a/examples/MultiThreading/btTaskScheduler.h b/examples/MultiThreading/btTaskScheduler.h
deleted file mode 100644
index a83b635eb..000000000
--- a/examples/MultiThreading/btTaskScheduler.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
-Copyright (c) 2003-2014 Erwin Coumans  http://bullet.googlecode.com
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose,
-including commercial applications, and to alter it and redistribute it freely,
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-
-#ifndef BT_TASK_SCHEDULER_H
-#define BT_TASK_SCHEDULER_H
-
-
-class btITaskScheduler;
-
-btITaskScheduler* createDefaultTaskScheduler();
-
-
-#endif // BT_TASK_SCHEDULER_H
diff --git a/src/BulletDynamics/CMakeLists.txt b/src/BulletDynamics/CMakeLists.txt
index f8a6f34ba..2eb03c39a 100644
--- a/src/BulletDynamics/CMakeLists.txt
+++ b/src/BulletDynamics/CMakeLists.txt
@@ -15,6 +15,8 @@ SET(BulletDynamics_SRCS
 	ConstraintSolver/btHingeConstraint.cpp
 	ConstraintSolver/btPoint2PointConstraint.cpp
 	ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
+	ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
+	ConstraintSolver/btBatchedConstraints.cpp
 	ConstraintSolver/btNNCGConstraintSolver.cpp
 	ConstraintSolver/btSliderConstraint.cpp
 	ConstraintSolver/btSolve2LinearConstraint.cpp
@@ -62,6 +64,7 @@ SET(ConstraintSolver_HDRS
 	ConstraintSolver/btJacobianEntry.h
 	ConstraintSolver/btPoint2PointConstraint.h
 	ConstraintSolver/btSequentialImpulseConstraintSolver.h
+	ConstraintSolver/btSequentialImpulseConstraintSolverMt.h
 	ConstraintSolver/btNNCGConstraintSolver.h
 	ConstraintSolver/btSliderConstraint.h
 	ConstraintSolver/btSolve2LinearConstraint.h
diff --git a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
new file mode 100644
index 000000000..bc840e889
--- /dev/null
+++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
@@ -0,0 +1,1129 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "btBatchedConstraints.h"
+
+#include "LinearMath/btIDebugDraw.h"
+#include "LinearMath/btMinMax.h"
+#include "LinearMath/btStackAlloc.h"
+#include "LinearMath/btQuickprof.h"
+
+#include <string.h> //for memset
+
+const int kNoMerge = -1;
+
+bool btBatchedConstraints::s_debugDrawBatches = false;
+
+
+struct btBatchedConstraintInfo  // one entry per constraint row, or per run of rows after run-length encoding
+{
+    int constraintIndex;  // index of the (first) solver constraint row this entry covers
+    int numConstraintRows;  // number of consecutive rows covered; 1 until run-length encoded
+    int bodyIds[2];  // copied from the row's m_solverBodyIdA / m_solverBodyIdB
+};
+
+
+struct btBatchInfo  // per-batch bookkeeping consumed by mergeSmallBatches and updateConstraintBatchIdsForMerges
+{
+    int phaseId;
+    int numConstraints;  // constraints currently counted in this batch (reset to 0 once merged away)
+    int mergeIndex;  // index of the batch this one was merged into, or kNoMerge
+
+    btBatchInfo(int _phaseId = -1) : phaseId(_phaseId), numConstraints(0), mergeIndex(kNoMerge) {}  // initializers follow declaration order
+};
+
+
+bool btBatchedConstraints::validate(btConstraintArray* constraints, const btAlignedObjectArray<btSolverBody>& bodies) const
+{
+    // validate: for debugging only. Verifies the coloring of bodies: within any one phase, no body with a
+    // non-zero inverse mass may be referenced by constraints that belong to more than one batch.
+    // Returns true when no violation was found; each violation also trips a btAssert.
+    int errors = 0;
+    const int kUnassignedBatch = -1;
+
+    btAlignedObjectArray<int> bodyBatchId;  // per body: the batch that claimed it in the current phase
+    for (int iPhase = 0; iPhase < m_phases.size(); ++iPhase)
+    {
+        bodyBatchId.resizeNoInitialize(0);  // shrink to empty first so the resize below resets every entry
+        bodyBatchId.resize( bodies.size(), kUnassignedBatch );
+        const Range& phase = m_phases[iPhase];
+        for (int iBatch = phase.begin; iBatch < phase.end; ++iBatch)
+        {
+            const Range& batch = m_batches[iBatch];
+            for (int iiCons = batch.begin; iiCons < batch.end; ++iiCons)
+            {
+                int iCons = m_constraintIndices[iiCons];
+                const btSolverConstraint& cons = constraints->at(iCons);
+                const btSolverBody& bodyA = bodies[cons.m_solverBodyIdA];
+                const btSolverBody& bodyB = bodies[cons.m_solverBodyIdB];
+                if (! bodyA.internalGetInvMass().isZero())  // bodies with zero inverse mass are not checked
+                {
+                    int thisBodyBatchId = bodyBatchId[cons.m_solverBodyIdA];
+                    if (thisBodyBatchId == kUnassignedBatch)
+                    {
+                        bodyBatchId[cons.m_solverBodyIdA] = iBatch;  // first use in this phase: claim the body
+                    }
+                    else if (thisBodyBatchId != iBatch)
+                    {
+                        btAssert( !"dynamic body is used in 2 different batches in the same phase" );
+                        errors++;
+                    }
+                }
+                if (! bodyB.internalGetInvMass().isZero())  // same check for the second body
+                {
+                    int thisBodyBatchId = bodyBatchId[cons.m_solverBodyIdB];
+                    if (thisBodyBatchId == kUnassignedBatch)
+                    {
+                        bodyBatchId[cons.m_solverBodyIdB] = iBatch;
+                    }
+                    else if (thisBodyBatchId != iBatch)
+                    {
+                        btAssert( !"dynamic body is used in 2 different batches in the same phase" );
+                        errors++;
+                    }
+                }
+            }
+        }
+    }
+    return errors == 0;
+}
+
+
+static void debugDrawSingleBatch( const btBatchedConstraints* bc,
+    btConstraintArray* constraints,
+    const btAlignedObjectArray<btSolverBody>& bodies,
+    int iBatch,
+    const btVector3& color,
+    const btVector3& offset
+    )
+{
+    // Draws one line per constraint row of batch iBatch, joining the origins of its two solver bodies (shifted by offset).
+    if ( !bc || !bc->m_debugDrawer || iBatch >= bc->m_batches.size() )
+    {
+        return;  // no batches / no drawer, or batch index past the end
+    }
+    const btBatchedConstraints::Range& range = bc->m_batches[ iBatch ];
+    for ( int slot = range.begin; slot < range.end; ++slot )
+    {
+        // m_constraintIndices maps a slot of the batch to a row of the constraint array
+        const btSolverConstraint& row = constraints->at( bc->m_constraintIndices[ slot ] );
+        btVector3 from = bodies[ row.m_solverBodyIdA ].getWorldTransform().getOrigin() + offset;
+        btVector3 to = bodies[ row.m_solverBodyIdB ].getWorldTransform().getOrigin() + offset;
+        bc->m_debugDrawer->drawLine( from, to, color );
+    }
+}
+
+
+static void debugDrawPhase( const btBatchedConstraints* bc,
+    btConstraintArray* constraints,
+    const btAlignedObjectArray<btSolverBody>& bodies,
+    int iPhase,
+    const btVector3& color0,
+    const btVector3& color1,
+    const btVector3& offset
+    )
+{
+    // Draws every batch of phase iPhase, blending the line color from color0 (first batch) to color1 (last batch).
+    BT_PROFILE( "debugDrawPhase" );
+    if ( !bc || !bc->m_debugDrawer || iPhase >= bc->m_phases.size() ) return;
+    const btBatchedConstraints::Range& range = bc->m_phases[ iPhase ];
+    // the divisor is clamped to 1 so a phase holding a single batch does not divide by zero
+    const float span = float( btMax( 1, range.end - range.begin - 1 ) );
+    for ( int batch = range.begin; batch < range.end; ++batch )
+    {
+        const float blend = float( batch - range.begin ) / span;
+        debugDrawSingleBatch( bc, constraints, bodies, batch, lerp( color0, color1, blend ), offset );
+    }
+}
+
+
+static void debugDrawAllBatches( const btBatchedConstraints* bc,
+    btConstraintArray* constraints,
+    const btAlignedObjectArray<btSolverBody>& bodies
+    )
+{
+    BT_PROFILE( "debugDrawAllBatches" );  // draws every phase, each displaced by its own offset so the phases do not overlap
+    if ( bc && bc->m_debugDrawer && bc->m_phases.size() > 0 )
+    {
+        btVector3 bboxMin(BT_LARGE_FLOAT, BT_LARGE_FLOAT, BT_LARGE_FLOAT);
+        btVector3 bboxMax = -bboxMin;
+        for (int iBody = 0; iBody < bodies.size(); ++iBody)
+        {
+            const btVector3& pos = bodies[iBody].getWorldTransform().getOrigin();
+            bboxMin.setMin(pos);
+            bboxMax.setMax(pos);
+        }
+        btVector3 bboxExtent = bboxMax - bboxMin;
+        btVector3 offsetBase = btVector3( 0, bboxExtent.y()*1.1f, 0 );  // lift the drawing above the bodies' bounding box
+        btVector3 offsetStep = btVector3( 0, 0, bboxExtent.z()*1.1f );  // spacing between consecutive phases along z
+        int numPhases = bc->m_phases.size();
+        // clamp the divisor so a single phase yields b == 0 instead of 0/0 (a NaN color component)
+        float lastPhase = float(btMax(1, numPhases - 1));
+        for (int iPhase = 0; iPhase < numPhases; ++iPhase)
+        {
+            float b = float(iPhase)/lastPhase;  // blue component encodes the phase: 0 for the first phase, 1 for the last
+            btVector3 offset = offsetBase + offsetStep*(float(iPhase) - float(numPhases-1)*0.5);
+            debugDrawPhase(bc, constraints, bodies, iPhase, btVector3(1,0,b), btVector3(0,1,b), offset);
+        }
+    }
+}
+
+
+static void initBatchedBodyDynamicFlags(btAlignedObjectArray<bool>* outBodyDynamicFlags, const btAlignedObjectArray<btSolverBody>& bodies)
+{
+    // Writes one flag per solver body: true when the x component of the body's inverse mass is positive.
+    BT_PROFILE("initBatchedBodyDynamicFlags");
+    const int numBodies = bodies.size();
+    outBodyDynamicFlags->resizeNoInitialize( numBodies );
+    for ( int iBody = 0; iBody < numBodies; ++iBody )
+    {
+        outBodyDynamicFlags->at( iBody ) = bodies[ iBody ].internalGetInvMass().x() > btScalar( 0 );
+    }
+}
+
+
+static int runLengthEncodeConstraintInfo(btBatchedConstraintInfo* outConInfos, int numConstraints)
+{
+    BT_PROFILE("runLengthEncodeConstraintInfo");
+    // detect and run-length encode, in place, runs of consecutive constraint rows that repeat the same bodies; returns the number of runs
+    int iDest = 0;  // next output slot; never ahead of iSrc, so unread input is not overwritten
+    int iSrc = 0;  // next unread input row
+    while (iSrc < numConstraints)
+    {
+        const btBatchedConstraintInfo& srcConInfo = outConInfos[iSrc];  // first row of the run
+        btBatchedConstraintInfo& conInfo = outConInfos[iDest];  // may alias srcConInfo when iDest == iSrc
+        conInfo.constraintIndex = iSrc;
+        conInfo.bodyIds[0] = srcConInfo.bodyIds[0];
+        conInfo.bodyIds[1] = srcConInfo.bodyIds[1];
+        while (iSrc < numConstraints && outConInfos[iSrc].bodyIds[0] == srcConInfo.bodyIds[0] && outConInfos[iSrc].bodyIds[1] == srcConInfo.bodyIds[1])
+        {
+            ++iSrc;  // always advances at least once: the first comparison is the run's first row against itself
+        }
+        conInfo.numConstraintRows = iSrc - conInfo.constraintIndex;
+        btAssert( conInfo.numConstraintRows <= 6 );  // NOTE(review): presumably the maximum rows expected for one body pair -- confirm
+        ++iDest;
+    }
+    return iDest;
+}
+
+
+struct ReadSolverConstraintsLoop : public btIParallelForBody  // parallel-for body: fills one btBatchedConstraintInfo per constraint row
+{
+    btBatchedConstraintInfo* m_outConInfos;
+    btConstraintArray* m_constraints;
+
+    ReadSolverConstraintsLoop( btBatchedConstraintInfo* outConInfos, btConstraintArray* constraints )
+        : m_outConInfos( outConInfos )
+        , m_constraints( constraints )
+    {}
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        // every row starts out as its own single-row entry
+        for ( int iRow = iBegin; iRow < iEnd; ++iRow )
+        {
+            const btSolverConstraint& row = m_constraints->at( iRow );
+            btBatchedConstraintInfo& info = m_outConInfos[ iRow ];
+            info.constraintIndex = iRow;
+            info.numConstraintRows = 1;
+            info.bodyIds[ 0 ] = row.m_solverBodyIdA;
+            info.bodyIds[ 1 ] = row.m_solverBodyIdB;
+        }
+    }
+};
+
+
+static int initBatchedConstraintInfo(btBatchedConstraintInfo* outConInfos, btConstraintArray* constraints)
+{
+    // Fills outConInfos with one entry per constraint row, then collapses runs of rows that share
+    // a body pair into single entries. Returns the number of entries left in outConInfos.
+    BT_PROFILE("initBatchedConstraintInfo");
+    bool inParallel = true;
+    bool useRunLengthEncoding = true;
+    int numEntries = constraints->size();
+
+    // step 1: read the body ids of every row
+    ReadSolverConstraintsLoop readLoop( outConInfos, constraints );
+    if ( inParallel )
+    {
+        int grainSize = 1200;
+        btParallelFor( 0, numEntries, grainSize, readLoop );
+    }
+    else
+    {
+        // same loop body, run on the calling thread
+        readLoop.forLoop( 0, numEntries );
+    }
+
+    // step 2: merge consecutive rows that reference the same two bodies
+    if ( useRunLengthEncoding )
+    {
+        numEntries = runLengthEncodeConstraintInfo( outConInfos, numEntries );
+    }
+
+    return numEntries;
+}
+
+
+static void expandConstraintRowsInPlace(int* constraintBatchIds, const btBatchedConstraintInfo* conInfos, int numConstraints, int numConstraintRows)
+{
+    BT_PROFILE("expandConstraintRowsInPlace");  // expands per-entry batch ids (first numConstraints slots) to per-row batch ids in the same array
+    if (numConstraintRows > numConstraints)  // nothing is done unless there are more rows than entries
+    {
+        // we walk the array in reverse to avoid overwriting entries that have not been read yet
+        for (int iCon = numConstraints - 1; iCon >= 0; --iCon)
+        {
+            const btBatchedConstraintInfo& conInfo = conInfos[iCon];
+            int iBatch = constraintBatchIds[iCon];  // read before any row slot of this entry is written
+            for (int i = conInfo.numConstraintRows - 1; i >= 0; --i)
+            {
+                int iDest = conInfo.constraintIndex + i;
+                btAssert(iDest >= iCon);  // destination rows never lie below the entry being read
+                btAssert(iDest >= 0 && iDest < numConstraintRows);
+                constraintBatchIds[iDest] = iBatch;
+            }
+        }
+    }
+}
+
+
+static void expandConstraintRows(int* destConstraintBatchIds, const int* srcConstraintBatchIds, const btBatchedConstraintInfo* conInfos, int numConstraints, int numConstraintRows)
+{
+    BT_PROFILE("expandConstraintRows");  // copies each entry's batch id to every row it covers (dest indexed by row, src by entry)
+    for ( int iEntry = 0; iEntry < numConstraints; ++iEntry )
+    {
+        const int batchId = srcConstraintBatchIds[ iEntry ];
+        const int firstRow = conInfos[ iEntry ].constraintIndex;
+        const int endRow = firstRow + conInfos[ iEntry ].numConstraintRows;
+        for ( int iRow = firstRow; iRow < endRow; ++iRow )
+        {
+            btAssert( iRow >= iEntry );
+            btAssert( iRow >= 0 && iRow < numConstraintRows );
+            destConstraintBatchIds[ iRow ] = batchId;
+        }
+    }
+}
+
+
+struct ExpandConstraintRowsLoop : public btIParallelForBody  // parallel-for body wrapping expandConstraintRows
+{
+    int* m_destConstraintBatchIds;
+    const int* m_srcConstraintBatchIds;
+    const btBatchedConstraintInfo* m_conInfos;
+    int m_numConstraintRows;
+
+    ExpandConstraintRowsLoop( int* destConstraintBatchIds, const int* srcConstraintBatchIds, const btBatchedConstraintInfo* conInfos, int numConstraintRows)
+        : m_destConstraintBatchIds( destConstraintBatchIds )
+        , m_srcConstraintBatchIds( srcConstraintBatchIds )
+        , m_conInfos( conInfos )
+        , m_numConstraintRows( numConstraintRows )
+    {}
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        // src ids and infos are indexed by entry, so both are advanced to iBegin; dest is indexed by row and is passed unshifted
+        expandConstraintRows( m_destConstraintBatchIds, m_srcConstraintBatchIds + iBegin, m_conInfos + iBegin, iEnd - iBegin, m_numConstraintRows );
+    }
+};
+
+
+static void expandConstraintRowsMt(int* destConstraintBatchIds, const int* srcConstraintBatchIds, const btBatchedConstraintInfo* conInfos, int numConstraints, int numConstraintRows)
+{
+    BT_PROFILE("expandConstraintRowsMt");  // same result as expandConstraintRows, dispatched through btParallelFor
+    const int kGrainSize = 600;  // grain size handed to btParallelFor
+    ExpandConstraintRowsLoop expandLoop( destConstraintBatchIds, srcConstraintBatchIds, conInfos, numConstraintRows );
+    btParallelFor( 0, numConstraints, kGrainSize, expandLoop );
+}
+
+
+static void initBatchedConstraintInfoArray(btAlignedObjectArray<btBatchedConstraintInfo>* outConInfos, btConstraintArray* constraints)
+{
+    // Sizes *outConInfos for one entry per constraint row, fills it, then shrinks it to the entry count left after encoding.
+    BT_PROFILE("initBatchedConstraintInfoArray");
+    btAlignedObjectArray<btBatchedConstraintInfo>& conInfos = *outConInfos;
+    conInfos.resizeNoInitialize(constraints->size());
+    // element 0 does not exist in an empty array, so only take its address when there is at least one row
+    int newSize = conInfos.size() > 0 ? initBatchedConstraintInfo(&conInfos[0], constraints) : 0;
+    conInfos.resizeNoInitialize(newSize);
+}
+
+
+static void mergeSmallBatches(btBatchInfo* batches, int iBeginBatch, int iEndBatch, int minBatchSize, int maxBatchSize)
+{
+    BT_PROFILE("mergeSmallBatches");  // folds batches smaller than minBatchSize into lower-indexed batches of [iBeginBatch, iEndBatch)
+    for ( int iBatch = iEndBatch - 1; iBatch >= iBeginBatch; --iBatch )
+    {
+        btBatchInfo& batch = batches[ iBatch ];
+        if ( batch.mergeIndex == kNoMerge && batch.numConstraints > 0 && batch.numConstraints < minBatchSize )
+        {
+            for ( int iDestBatch = iBatch - 1; iDestBatch >= iBeginBatch; --iDestBatch )  // nearest lower-indexed candidate first
+            {
+                btBatchInfo& destBatch = batches[ iDestBatch ];
+                if ( destBatch.mergeIndex == kNoMerge && ( destBatch.numConstraints + batch.numConstraints ) < maxBatchSize )
+                {
+                    destBatch.numConstraints += batch.numConstraints;
+                    batch.numConstraints = 0;  // the merged batch is left empty
+                    batch.mergeIndex = iDestBatch;
+                    break;
+                }
+            }
+        }
+    }
+    // flatten mergeIndexes
+    // e.g. in case where A was merged into B and then B was merged into C, we need A to point to C instead of B
+    // Note: loop goes forward through batches because batches always merge from higher indexes to lower,
+    //     so by going from low to high it reduces the amount of trail-following
+    for ( int iBatch = iBeginBatch; iBatch < iEndBatch; ++iBatch )
+    {
+        btBatchInfo& batch = batches[ iBatch ];
+        if ( batch.mergeIndex != kNoMerge )
+        {
+            int iMergeDest = batches[ batch.mergeIndex ].mergeIndex;  // kNoMerge here means the direct target is already final
+            // follow trail of merges to the end
+            while ( iMergeDest != kNoMerge )
+            {
+                int iNext = batches[ iMergeDest ].mergeIndex;
+                if ( iNext == kNoMerge )
+                {
+                    batch.mergeIndex = iMergeDest;  // iMergeDest was not merged any further: it is the final destination
+                    break;
+                }
+                iMergeDest = iNext;
+            }
+        }
+    }
+}
+
+
+static void updateConstraintBatchIdsForMerges(int* constraintBatchIds, int numConstraints, const btBatchInfo* batches, int numBatches)
+{
+    BT_PROFILE("updateConstraintBatchIdsForMerges");
+    // redirect every constraint whose batch was merged away to the batch recorded in its mergeIndex
+    for ( int iCon = 0; iCon < numConstraints; ++iCon )
+    {
+        int& batchId = constraintBatchIds[ iCon ];
+        btAssert( batchId < numBatches );
+        const int mergedInto = batches[ batchId ].mergeIndex;
+        if ( mergedInto == kNoMerge )
+        {
+            continue;  // this batch was not merged: keep the id
+        }
+        batchId = mergedInto;
+    }
+}
+
+
+struct UpdateConstraintBatchIdsForMergesLoop : public btIParallelForBody  // parallel-for body wrapping updateConstraintBatchIdsForMerges
+{
+    int* m_constraintBatchIds;
+    const btBatchInfo* m_batches;
+    int m_numBatches;
+
+    UpdateConstraintBatchIdsForMergesLoop( int* constraintBatchIds, const btBatchInfo* batches, int numBatches )
+        : m_constraintBatchIds( constraintBatchIds )
+        , m_batches( batches )
+        , m_numBatches( numBatches )
+    {}
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "UpdateConstraintBatchIdsForMergesLoop" );
+        // only the id array is sliced to [iBegin, iEnd); the batch table is passed whole
+        updateConstraintBatchIdsForMerges( m_constraintBatchIds + iBegin, iEnd - iBegin, m_batches, m_numBatches );
+    }
+};
+
+
+static void updateConstraintBatchIdsForMergesMt(int* constraintBatchIds, int numConstraints, const btBatchInfo* batches, int numBatches)
+{
+    BT_PROFILE( "updateConstraintBatchIdsForMergesMt" );  // same result as updateConstraintBatchIdsForMerges, dispatched through btParallelFor
+    const int kGrainSize = 800;  // grain size handed to btParallelFor
+    UpdateConstraintBatchIdsForMergesLoop mergeLoop( constraintBatchIds, batches, numBatches );
+    btParallelFor( 0, numConstraints, kGrainSize, mergeLoop );
+}
+
+
+inline bool BatchCompare(const btBatchedConstraints::Range& a, const btBatchedConstraints::Range& b)
+{
+    // sort predicate: true when range a spans more elements than range b,
+    // which orders ranges from largest to smallest
+    return ( a.end - a.begin ) > ( b.end - b.begin );
+}
+
+
+static void writeOutConstraintIndicesForRangeOfBatches(btBatchedConstraints* bc,
+    const int* constraintBatchIds,
+    int numConstraints,
+    int* constraintIdPerBatch,
+    int batchBegin,
+    int batchEnd
+    )
+{
+    // Scans every constraint and, for those whose batch id lies in [batchBegin, batchEnd),
+    // stores the constraint index at that batch's write cursor in bc->m_constraintIndices.
+    BT_PROFILE("writeOutConstraintIndicesForRangeOfBatches");
+    for ( int con = 0; con < numConstraints; ++con )
+    {
+        const int batchId = constraintBatchIds[ con ];
+        if ( batchId < batchBegin || batchId >= batchEnd )
+            continue;  // batch lies outside the requested range
+        const int slot = constraintIdPerBatch[ batchId ]++;  // claim this batch's next free slot
+        bc->m_constraintIndices[ slot ] = con;
+    }
+}
+
+
+struct WriteOutConstraintIndicesLoop : public btIParallelForBody
+{
+    // btParallelFor body whose loop index is a phase number: each call converts
+    // its phase range into a batch id range and writes out the constraint
+    // indices of the batches in that range.
+    btBatchedConstraints* m_batchedConstraints;
+    const int* m_constraintBatchIds;
+    int m_numConstraints;
+    int* m_constraintIdPerBatch;
+    int m_maxNumBatchesPerPhase;
+
+    WriteOutConstraintIndicesLoop( btBatchedConstraints* bc, const int* constraintBatchIds, int numConstraints, int* constraintIdPerBatch, int maxNumBatchesPerPhase )
+        : m_batchedConstraints( bc ),
+          m_constraintBatchIds( constraintBatchIds ),
+          m_numConstraints( numConstraints ),
+          m_constraintIdPerBatch( constraintIdPerBatch ),
+          m_maxNumBatchesPerPhase( maxNumBatchesPerPhase )
+    {}
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "WriteOutConstraintIndicesLoop" );
+        // phases [iBegin, iEnd) cover batch ids [iBegin*maxPerPhase, iEnd*maxPerPhase)
+        writeOutConstraintIndicesForRangeOfBatches( m_batchedConstraints,
+            m_constraintBatchIds,
+            m_numConstraints,
+            m_constraintIdPerBatch,
+            iBegin * m_maxNumBatchesPerPhase,
+            iEnd * m_maxNumBatchesPerPhase );
+    }
+};
+
+
+static void writeOutConstraintIndicesMt(btBatchedConstraints* bc,
+    const int* constraintBatchIds,
+    int numConstraints,
+    int* constraintIdPerBatch,
+    int maxNumBatchesPerPhase,
+    int numPhases
+    )
+{
+    // Fills bc->m_constraintIndices so that the constraints of each batch are
+    // contiguous, using constraintIdPerBatch[] as the per-batch write cursor.
+    BT_PROFILE("writeOutConstraintIndicesMt");
+    bool runInParallel = true;  // set to false to use the single-threaded path below
+    if ( !runInParallel )
+    {
+        for ( int con = 0; con < numConstraints; ++con )
+        {
+            const int batchId = constraintBatchIds[ con ];
+            const int slot = constraintIdPerBatch[ batchId ]++;
+            bc->m_constraintIndices[ slot ] = con;
+        }
+        return;
+    }
+    // one work item per phase; each item only advances the cursors of its own phase's batches
+    WriteOutConstraintIndicesLoop phaseLoop( bc, constraintBatchIds, numConstraints, constraintIdPerBatch, maxNumBatchesPerPhase );
+    btParallelFor( 0, numPhases, 1, phaseLoop );
+}
+
+
+static void writeGrainSizes(btBatchedConstraints* bc)
+{
+    // Chooses a grain size per phase: a quarter of the phase's batch count
+    // divided by the scheduler's thread count, rounded down, but never below 1.
+    const int phaseCount = bc->m_phases.size();
+    const float threadCount = float( btGetTaskScheduler()->getNumThreads() );
+    bc->m_phaseGrainSize.resizeNoInitialize( phaseCount );
+    for ( int i = 0; i < phaseCount; ++i )
+    {
+        const btBatchedConstraints::Range& phase = bc->m_phases[ i ];
+        const int grain = int( floor( 0.25f * ( phase.end - phase.begin ) / threadCount ) );
+        bc->m_phaseGrainSize[ i ] = btMax( 1, grain );
+    }
+}
+
+
+static void writeOutBatches(btBatchedConstraints* bc,
+    const int* constraintBatchIds,
+    int numConstraints,
+    const btBatchInfo* batches,
+    int* batchWork,
+    int maxNumBatchesPerPhase,
+    int numPhases
+)
+{
+    // Converts the per-constraint batch ids plus the per-batch constraint counts
+    // into the final layout stored in bc: a flat m_constraintIndices array,
+    // batches as ranges into that array, and phases as ranges of batches.
+    BT_PROFILE("writeOutBatches");
+    typedef btBatchedConstraints::Range Range;
+    bc->m_constraintIndices.reserve( numConstraints );
+    bc->m_batches.resizeNoInitialize( 0 );
+    bc->m_phases.resizeNoInitialize( 0 );
+
+    // batchWork is used as a table of write cursors: for each batch, the next
+    // available slot in m_constraintIndices (initially the start of the batch)
+    int* batchCursors = batchWork;
+    int nextSlot = 0;
+    for ( int phase = 0; phase < numPhases; ++phase )
+    {
+        const int firstBatchOfPhase = bc->m_batches.size();
+        const int slotBegin = phase * maxNumBatchesPerPhase;
+        const int slotEnd = slotBegin + maxNumBatchesPerPhase;
+        for ( int iBatch = slotBegin; iBatch < slotEnd; ++iBatch )
+        {
+            const int batchSize = batches[ iBatch ].numConstraints;
+            const int batchStart = nextSlot;
+            batchCursors[ iBatch ] = batchStart;
+            nextSlot += batchSize;
+            if ( batchSize > 0 )  // empty batch slots get a cursor but are not emitted
+            {
+                bc->m_batches.push_back( Range( batchStart, nextSlot ) );
+            }
+        }
+        // a phase is emitted only if at least one of its batches was emitted
+        if ( bc->m_batches.size() > firstBatchOfPhase )
+        {
+            bc->m_phases.push_back( Range( firstBatchOfPhase, bc->m_batches.size() ) );
+        }
+    }
+    btAssert( nextSlot == numConstraints );
+    bc->m_constraintIndices.resizeNoInitialize( numConstraints );
+    writeOutConstraintIndicesMt( bc, constraintBatchIds, numConstraints, batchCursors, maxNumBatchesPerPhase, numPhases );
+    // sort the batches of each phase from largest to smallest (can be helpful to some task schedulers)
+    for ( int iPhase = 0; iPhase < bc->m_phases.size(); ++iPhase )
+    {
+        const Range& phaseBatches = bc->m_phases[ iPhase ];
+        bc->m_batches.quickSortInternal( BatchCompare, phaseBatches.begin, phaseBatches.end - 1 );
+    }
+    // the phase order starts out as the identity permutation
+    const int phaseCount = bc->m_phases.size();
+    bc->m_phaseOrder.resize( phaseCount );
+    for ( int iPhase = 0; iPhase < phaseCount; ++iPhase )
+    {
+        bc->m_phaseOrder[ iPhase ] = iPhase;
+    }
+    writeGrainSizes( bc );
+}
+
+
+//
+// PreallocatedMemoryHelper -- helper object for allocating a number of chunks of memory in a single contiguous block.
+//                             It is generally more efficient to do a single larger allocation than many smaller allocations.
+//                             Every chunk is padded to a multiple of 16 bytes, so when the block itself is 16-byte aligned
+//                             each chunk starts 16-byte aligned too (e.g. an int chunk that follows a bool chunk).
+// Example Usage:
+//  btVector3* bodyPositions = NULL;
+//  btBatchedConstraintInfo* conInfos = NULL;
+//  {
+//    PreallocatedMemoryHelper<8> memHelper;
+//    memHelper.addChunk( (void**) &bodyPositions, sizeof( btVector3 ) * bodies.size() );
+//    memHelper.addChunk( (void**) &conInfos, sizeof( btBatchedConstraintInfo ) * numConstraints );
+//    void* memPtr = malloc( memHelper.getSizeToAllocate() );  // allocate the memory (should be 16-byte aligned)
+//    memHelper.setChunkPointers( memPtr );  // update pointers to chunks
+//  }
+template <int N>
+class PreallocatedMemoryHelper
+{
+    struct Chunk
+    {
+        void** ptr;   // address of the caller's pointer that will receive the chunk start
+        size_t size;  // chunk size in bytes, already rounded up to a multiple of 16
+    };
+    Chunk m_chunks[N];
+    int m_numChunks;  // number of chunks registered so far (at most N)
+public:
+    PreallocatedMemoryHelper() {m_numChunks=0;}
+    void addChunk( void** ptr, size_t sz )  // registers a chunk of sz bytes; *ptr is assigned by setChunkPointers()
+    {
+        btAssert( m_numChunks < N );
+        if ( m_numChunks < N )  // without asserts a chunk beyond the capacity N is silently ignored
+        {
+            Chunk& chunk = m_chunks[ m_numChunks ];
+            chunk.ptr = ptr;
+            chunk.size = ( sz + 15 ) & ~size_t( 15 );  // pad so that the following chunk stays 16-byte aligned
+            m_numChunks++;
+        }
+    }
+    size_t getSizeToAllocate() const  // total bytes needed for all registered chunks, padding included
+    {
+        size_t totalSize = 0;
+        for (int i = 0; i < m_numChunks; ++i)
+        {
+            totalSize += m_chunks[i].size;
+        }
+        return totalSize;
+    }
+    void setChunkPointers(void* mem) const  // mem must provide at least getSizeToAllocate() bytes
+    {
+        size_t totalSize = 0;  // running byte offset of the current chunk within mem
+        for (int i = 0; i < m_numChunks; ++i)
+        {
+            const Chunk& chunk = m_chunks[ i ];
+            char* chunkPtr = static_cast<char*>(mem) + totalSize;
+            *chunk.ptr = chunkPtr;
+            totalSize += chunk.size;
+        }
+    }
+};
+
+
+
+static btVector3 findMaxDynamicConstraintExtent(
+    btVector3* bodyPositions,
+    bool* bodyDynamicFlags,
+    btBatchedConstraintInfo* conInfos,
+    int numConstraints,
+    int numBodies
+    )
+{
+    // Returns, per axis, the largest absolute position difference between the two bodies
+    // of any constraint that joins two dynamic bodies.
+    BT_PROFILE("findMaxDynamicConstraintExtent");
+    // the result starts at 0.001 on every axis, so it is never smaller than that
+    btVector3 maxExtent = btVector3(1,1,1) * 0.001;
+    for (int c = 0; c < numConstraints; ++c)
+    {
+        const int bodyA = conInfos[ c ].bodyIds[0];
+        const int bodyB = conInfos[ c ].bodyIds[1];
+        btAssert(bodyA >= 0 && bodyA < numBodies);
+        btAssert(bodyB >= 0 && bodyB < numBodies);
+        if (!bodyDynamicFlags[bodyA] || !bodyDynamicFlags[bodyB])
+            continue;  // only constraints between two dynamic bodies count
+        const btVector3 separation = bodyPositions[bodyB] - bodyPositions[bodyA];
+        maxExtent.setMax(separation.absolute());
+    }
+    return maxExtent;
+}
+
+
+struct btIntVec3
+{
+    int m_ints[ 3 ];  // components in x, y, z order
+    // unchecked component access by index
+    SIMD_FORCE_INLINE int&       operator[](int idx)       { return m_ints[ idx ]; }
+    SIMD_FORCE_INLINE const int& operator[](int idx) const { return m_ints[ idx ]; }
+};
+
+
+struct AssignConstraintsToGridBatchesParams
+{
+    bool* bodyDynamicFlags;          // per body: true if the body is dynamic
+    btIntVec3* bodyGridCoords;       // per body: grid cell coordinates
+    int numBodies;                   // number of bodies
+    btBatchedConstraintInfo* conInfos;  // per constraint: the ids of its two bodies
+    char* constraintPhaseIds;        // output, per constraint: phase id
+    int* constraintBatchIds;         // output, per constraint: batch id
+    btIntVec3 gridChunkDim;          // number of chunks along each axis
+    int maxNumBatchesPerPhase;       // batch id stride between consecutive phases
+    int numPhases;                   // number of phases
+    int phaseMask;                   // phase id bits that are allowed to be set
+    // every field starts out zero / null
+    AssignConstraintsToGridBatchesParams()
+        : bodyDynamicFlags( NULL ), bodyGridCoords( NULL ), numBodies( 0 ), conInfos( NULL ), constraintPhaseIds( NULL ),
+          constraintBatchIds( NULL ), gridChunkDim(), maxNumBatchesPerPhase( 0 ), numPhases( 0 ), phaseMask( 0 )
+    {}
+};
+
+
+static void assignConstraintsToGridBatches(const AssignConstraintsToGridBatchesParams& params, int iConBegin, int iConEnd)
+{
+    BT_PROFILE("assignConstraintsToGridBatches");
+    // assigns a phase id and a batch id to each constraint in [iConBegin, iConEnd) (can be done in parallel)
+    for ( int iCon = iConBegin; iCon < iConEnd; ++iCon )
+    {
+        const btBatchedConstraintInfo& con = params.conInfos[ iCon ];
+        int iBody0 = con.bodyIds[ 0 ];
+        int iBody1 = con.bodyIds[ 1 ];
+        int iPhase = iCon; //iBody0; // pseudorandom choice to distribute evenly amongst phases
+        iPhase &= params.phaseMask;  // keep only the phase bits allowed by the mask
+        int gridCoord[ 3 ];  // grid cell used to locate the constraint's chunk
+        // is it a dynamic constraint?
+        if ( params.bodyDynamicFlags[ iBody0 ] && params.bodyDynamicFlags[ iBody1 ] )
+        {
+            const btIntVec3& body0Coords = params.bodyGridCoords[iBody0];
+            const btIntVec3& body1Coords = params.bodyGridCoords[iBody1];
+            // for each dimension x,y,z,
+            for (int i = 0; i < 3; ++i)
+            {
+                int coordMin = btMin(body0Coords.m_ints[i], body1Coords.m_ints[i]);
+                int coordMax = btMax(body0Coords.m_ints[i], body1Coords.m_ints[i]);
+                if (coordMin != coordMax)  // the two bodies sit in different cells on this axis
+                {
+                    btAssert( coordMax == coordMin + 1 );
+                    if ((coordMin&1) == 0)  // lower cell is even: both cells share a chunk when the offset bit is 0
+                    {
+                        iPhase &= ~(1 << i); // force bit off
+                    }
+                    else  // lower cell is odd: both cells share a chunk when the offset bit is 1
+                    {
+                        iPhase |= (1 << i); // force bit on
+                        iPhase &= params.phaseMask;  // re-apply the mask in case this axis' bit is not allowed
+                    }
+                }
+                gridCoord[ i ] = coordMin;  // the lower of the two cells locates the constraint
+            }
+        }
+        else  // at least one body is not dynamic: locate the constraint by a single body's cell
+        {
+            if ( !params.bodyDynamicFlags[ iBody0 ] )
+            {
+                iBody0 = con.bodyIds[ 1 ];  // body 0 is not dynamic, so use body 1 instead
+            }
+            btAssert(params.bodyDynamicFlags[ iBody0 ]);
+            const btIntVec3& body0Coords = params.bodyGridCoords[iBody0];
+            // for each dimension x,y,z,
+            for ( int i = 0; i < 3; ++i )
+            {
+                gridCoord[ i ] = body0Coords.m_ints[ i ];
+            }
+        }
+        // calculate chunk coordinates
+        int chunkCoord[ 3 ];
+        btIntVec3 gridChunkDim = params.gridChunkDim;
+        // for each dimension x,y,z,
+        for ( int i = 0; i < 3; ++i )
+        {
+            int coordOffset = ( iPhase >> i ) & 1;  // this phase's 1-cell offset along axis i
+            chunkCoord[ i ] = (gridCoord[ i ] - coordOffset)/2;  // chunks are 2 cells wide; (0 - 1)/2 truncates to 0
+            btClamp( chunkCoord[ i ], 0, gridChunkDim[ i ] - 1);  // keep the chunk coordinate inside the chunk grid
+            btAssert( chunkCoord[ i ] < gridChunkDim[ i ] );
+        }
+        int iBatch = iPhase * params.maxNumBatchesPerPhase + chunkCoord[ 0 ] + chunkCoord[ 1 ] * gridChunkDim[ 0 ] + chunkCoord[ 2 ] * gridChunkDim[ 0 ] * gridChunkDim[ 1 ];  // phase block + linearized chunk index
+        btAssert(iBatch >= 0 && iBatch < params.maxNumBatchesPerPhase*params.numPhases);
+        params.constraintPhaseIds[ iCon ] = iPhase;
+        params.constraintBatchIds[ iCon ] = iBatch;
+    }
+}
+
+
+struct AssignConstraintsToGridBatchesLoop : public btIParallelForBody
+{
+    // btParallelFor body forwarding [iBegin, iEnd) to assignConstraintsToGridBatches;
+    // only the address of params is stored, so params must outlive this object
+    const AssignConstraintsToGridBatchesParams* m_params;
+
+    AssignConstraintsToGridBatchesLoop( const AssignConstraintsToGridBatchesParams& params )
+        : m_params( &params ) {}
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        assignConstraintsToGridBatches( *m_params, iBegin, iEnd );
+    }
+};
+
+
+//
+// setupSpatialGridBatchesMt -- generate batches using a uniform 3D grid
+//
+/*
+
+Bodies are treated as 3D points at their center of mass. We only consider dynamic bodies at this stage,
+kinematic and static bodies are dealt with at a later stage. Also we only consider constraints that
+are between 2 dynamic bodies ("dynamic" constraints) -- constraints that involve a static or kinematic body are handled later
+
+1. Compute a bounding box around all dynamic bodies
+2. Compute the maximum extent of all dynamic constraints. Each dynamic constraint is treated as a line segment, and we need the size of
+   box that will fully enclose any single dynamic constraint
+
+3. Establish the cell size of our grid, the cell size in each dimension must be at least as large as the dynamic constraints max-extent,
+   so that no dynamic constraint can span more than 2 cells of our grid on any axis of the grid. The cell size should be adjusted
+   larger in order to keep the total number of cells from being excessively high
+
+Key idea: Given that each constraint spans 1 or 2 grid cells in each dimension, we can handle all dynamic constraints by processing
+          in chunks of 2x2x2 cells with 8 different 1-cell offsets ((0,0,0),(0,0,1),(0,1,0),(0,1,1),(1,0,0)...).
+          For each of the 8 offsets, we create a phase, and each 2x2x2 chunk with dynamic constraints becomes a batch in that phase.
+
+ Once all of the phases have been populated, if any of the phases end up with too few batches, they could possibly be merged with other phases.
+
+ Finally, we handle all of the remaining (non-dynamic) constraints, these can be added to whichever phase is least populated to help
+ even things out
+
+*/
+//
+static void setupSpatialGridBatchesMt(
+    btBatchedConstraints* batchedConstraints,
+    btAlignedObjectArray<char>* scratchMemory,
+    btConstraintArray* constraints,
+    const btAlignedObjectArray<btSolverBody>& bodies,
+    int minBatchSize,
+    int maxBatchSize,
+    bool use2DGrid
+)
+{
+    BT_PROFILE("setupSpatialGridBatchesMt");
+    const int numPhases = 8;  // one phase per 1-cell offset of the 2x2x2 chunks
+    int numConstraints = constraints->size();
+    int numConstraintRows = constraints->size();
+
+    const int maxGridChunkCount = 128;
+    int allocNumBatchesPerPhase = maxGridChunkCount;
+    // the batch arrays below are sized for the worst case: maxGridChunkCount batches in every phase
+    int allocNumBatches = allocNumBatchesPerPhase * numPhases;
+
+    btVector3* bodyPositions = NULL;
+    bool* bodyDynamicFlags = NULL;
+    btIntVec3* bodyGridCoords = NULL;
+    btBatchInfo* batches = NULL;
+    int* batchWork = NULL;
+    btBatchedConstraintInfo* conInfos = NULL;
+    char* constraintPhaseIds = NULL;
+    int* constraintBatchIds = NULL;
+    int* constraintRowBatchIds = NULL;
+    {
+        PreallocatedMemoryHelper<10> memHelper;  // carves all of the working arrays out of scratchMemory
+        memHelper.addChunk( (void**) &bodyPositions, sizeof( btVector3 ) * bodies.size() );
+        memHelper.addChunk( (void**) &bodyDynamicFlags, sizeof( bool ) * bodies.size() );
+        memHelper.addChunk( (void**) &bodyGridCoords, sizeof( btIntVec3 ) * bodies.size() );
+        memHelper.addChunk( (void**) &batches, sizeof( btBatchInfo )* allocNumBatches );
+        memHelper.addChunk( (void**) &batchWork, sizeof( int )* allocNumBatches );
+        memHelper.addChunk( (void**) &conInfos, sizeof( btBatchedConstraintInfo ) * numConstraints );
+        memHelper.addChunk( (void**) &constraintPhaseIds, sizeof( char ) * numConstraints );
+        memHelper.addChunk( (void**) &constraintBatchIds, sizeof( int ) * numConstraints );
+        memHelper.addChunk( (void**) &constraintRowBatchIds, sizeof( int ) * numConstraintRows );
+        size_t scratchSize = memHelper.getSizeToAllocate();
+        scratchMemory->resizeNoInitialize( scratchSize );
+        char* memPtr = &scratchMemory->at(0);
+        memHelper.setChunkPointers( memPtr );
+    }
+
+    numConstraints = initBatchedConstraintInfo(conInfos, constraints);
+
+    // compute bounding box around all dynamic bodies
+    // (could be done in parallel)
+    btVector3 bboxMin(BT_LARGE_FLOAT, BT_LARGE_FLOAT, BT_LARGE_FLOAT);
+    btVector3 bboxMax = -bboxMin;
+    // the same pass also caches each body's position and dynamic flag in the scratch arrays
+    for (int i = 0; i < bodies.size(); ++i)
+    {
+        const btSolverBody& body = bodies[i];
+        btVector3 bodyPos = body.getWorldTransform().getOrigin();
+        bool isDynamic = ( body.internalGetInvMass().x() > btScalar( 0 ) );
+        bodyPositions[i] = bodyPos;
+        bodyDynamicFlags[i] = isDynamic;
+        if (isDynamic)
+        {
+            // only dynamic bodies contribute to the bounding box
+            bboxMin.setMin(bodyPos);
+            bboxMax.setMax(bodyPos);
+        }
+    }
+
+    // find max extent of all dynamic constraints
+    // (could be done in parallel)
+    btVector3 consExtent = findMaxDynamicConstraintExtent(bodyPositions, bodyDynamicFlags, conInfos, numConstraints, bodies.size());
+
+    btVector3 gridExtent = bboxMax - bboxMin;
+
+    btVector3 gridCellSize = consExtent;  // a cell must be at least as large as the largest dynamic constraint
+    int gridDim[3];
+    gridDim[ 0 ] = int( 1.0 + gridExtent.x() / gridCellSize.x() );
+    gridDim[ 1 ] = int( 1.0 + gridExtent.y() / gridCellSize.y() );
+    gridDim[ 2 ] = int( 1.0 + gridExtent.z() / gridCellSize.z() );
+
+    // if we can collapse an axis, it will cut our number of phases in half which could be more efficient
+    int phaseMask = 7;
+    bool collapseAxis = use2DGrid;
+    if ( collapseAxis )
+    {
+        // pick the smallest axis to collapse, leaving us with the greatest number of cells in our grid
+        int iAxisToCollapse = 0;
+        int axisDim = gridDim[iAxisToCollapse];
+        //for each dimension
+        for ( int i = 0; i < 3; ++i )
+        {
+            if (gridDim[i] < axisDim)
+            {
+                iAxisToCollapse = i;
+                axisDim = gridDim[i];
+            }
+        }
+        // collapse it into a single cell; never go below the current cell size, so a zero extent cannot give a zero-sized cell
+        gridCellSize[iAxisToCollapse] = btMax( gridCellSize[iAxisToCollapse], gridExtent[iAxisToCollapse] * 2.0f );
+        phaseMask &= ~(1 << iAxisToCollapse);
+    }
+
+    int numGridChunks = 0;
+    btIntVec3 gridChunkDim;  // each chunk is 2x2x2 group of cells
+    while (true)  // grow the cells until the number of chunks fits within maxGridChunkCount
+    {
+        gridDim[0] = int( 1.0 + gridExtent.x() / gridCellSize.x() );
+        gridDim[1] = int( 1.0 + gridExtent.y() / gridCellSize.y() );
+        gridDim[2] = int( 1.0 + gridExtent.z() / gridCellSize.z() );
+        gridChunkDim[ 0 ] = btMax( 1, ( gridDim[ 0 ] + 0 ) / 2 );
+        gridChunkDim[ 1 ] = btMax( 1, ( gridDim[ 1 ] + 0 ) / 2 );
+        gridChunkDim[ 2 ] = btMax( 1, ( gridDim[ 2 ] + 0 ) / 2 );
+        numGridChunks = gridChunkDim[ 0 ] * gridChunkDim[ 1 ] * gridChunkDim[ 2 ];
+        float nChunks = float(gridChunkDim[0]) * float(gridChunkDim[1]) * float(gridChunkDim[2]);  // float product, because the int product above is susceptible to integer overflow
+        if ( numGridChunks <= maxGridChunkCount && nChunks <= maxGridChunkCount )
+        {
+            break;
+        }
+        gridCellSize *= 1.25; // should roughly cut numCells in half
+    }
+    btAssert(numGridChunks <= maxGridChunkCount );
+    int maxNumBatchesPerPhase = numGridChunks;
+
+    // for each dynamic body, compute grid coords
+    btVector3 invGridCellSize = btVector3(1,1,1)/gridCellSize;
+    // (can be done in parallel)
+    for (int iBody = 0; iBody < bodies.size(); ++iBody)
+    {
+        btIntVec3& coords = bodyGridCoords[iBody];
+        if (bodyDynamicFlags[iBody])
+        {
+            btVector3 v = ( bodyPositions[ iBody ] - bboxMin )*invGridCellSize;
+            coords.m_ints[0] = int(v.x());
+            coords.m_ints[1] = int(v.y());
+            coords.m_ints[2] = int(v.z());
+            btAssert(coords.m_ints[0] >= 0 && coords.m_ints[0] < gridDim[0]);
+            btAssert(coords.m_ints[1] >= 0 && coords.m_ints[1] < gridDim[1]);
+            btAssert(coords.m_ints[2] >= 0 && coords.m_ints[2] < gridDim[2]);
+        }
+        else  // non-dynamic bodies get the marker coordinates (-1,-1,-1)
+        {
+            coords.m_ints[0] = -1;
+            coords.m_ints[1] = -1;
+            coords.m_ints[2] = -1;
+        }
+    }
+
+    for (int iPhase = 0; iPhase < numPhases; ++iPhase)  // reset every batch slot that can be referenced
+    {
+        int batchBegin = iPhase * maxNumBatchesPerPhase;
+        int batchEnd = batchBegin + maxNumBatchesPerPhase;
+        for ( int iBatch = batchBegin; iBatch < batchEnd; ++iBatch )
+        {
+            btBatchInfo& batch = batches[ iBatch ];
+            batch = btBatchInfo( iPhase );
+        }
+    }
+
+    {
+        AssignConstraintsToGridBatchesParams params;
+        params.bodyDynamicFlags = bodyDynamicFlags;
+        params.bodyGridCoords = bodyGridCoords;
+        params.numBodies = bodies.size();
+        params.conInfos = conInfos;
+        params.constraintPhaseIds = constraintPhaseIds;
+        params.constraintBatchIds = constraintBatchIds;
+        params.gridChunkDim = gridChunkDim;
+        params.maxNumBatchesPerPhase = maxNumBatchesPerPhase;
+        params.numPhases = numPhases;
+        params.phaseMask = phaseMask;
+        bool inParallel = true;
+        if (inParallel)
+        {
+            AssignConstraintsToGridBatchesLoop loop(params);
+            int grainSize = 500;
+            btParallelFor(0, numConstraints, grainSize, loop);
+        }
+        else
+        {
+            assignConstraintsToGridBatches( params, 0, numConstraints );
+        }
+    }
+    for ( int iCon = 0; iCon < numConstraints; ++iCon )  // tally the constraint rows that landed in each batch
+    {
+        const btBatchedConstraintInfo& con = conInfos[ iCon ];
+        int iBatch = constraintBatchIds[ iCon ];
+        btBatchInfo& batch = batches[iBatch];
+        batch.numConstraints += con.numConstraintRows;
+    }
+
+    for (int iPhase = 0; iPhase < numPhases; ++iPhase)
+    {
+        // if phase is legit (its id only uses bits that phaseMask allows),
+        if (iPhase == (iPhase&phaseMask))
+        {
+            int iBeginBatch = iPhase * maxNumBatchesPerPhase;
+            int iEndBatch = iBeginBatch + maxNumBatchesPerPhase;
+            mergeSmallBatches( batches, iBeginBatch, iEndBatch, minBatchSize, maxBatchSize );
+        }
+    }
+    // all constraints have been assigned a batchId
+    updateConstraintBatchIdsForMergesMt(constraintBatchIds, numConstraints, batches, maxNumBatchesPerPhase*numPhases);
+
+    if (numConstraintRows > numConstraints)
+    {
+        expandConstraintRowsMt(&constraintRowBatchIds[0], &constraintBatchIds[0], &conInfos[0], numConstraints, numConstraintRows);
+    }
+    else  // the two counts match, so the per-constraint ids are used directly as per-row ids
+    {
+        constraintRowBatchIds = constraintBatchIds;
+    }
+
+    writeOutBatches(batchedConstraints, constraintRowBatchIds, numConstraintRows, batches, batchWork, maxNumBatchesPerPhase, numPhases);
+    btAssert(batchedConstraints->validate(constraints, bodies));
+}
+
+
+static void setupSingleBatch(
+    btBatchedConstraints* bc,
+    int numConstraints
+)
+{
+    // Trivial batching: all constraints go into one batch in one phase, in their original order.
+    BT_PROFILE("setupSingleBatch");
+    typedef btBatchedConstraints::Range Range;
+
+    // start from empty batch / phase tables
+    bc->m_batches.resizeNoInitialize( 0 );
+    bc->m_phases.resizeNoInitialize( 0 );
+    bc->m_phaseOrder.resizeNoInitialize( 0 );
+    bc->m_phaseGrainSize.resizeNoInitialize( 0 );
+    // constraint indices are the identity mapping
+    bc->m_constraintIndices.resize( numConstraints );
+    for ( int iCon = 0; iCon < numConstraints; ++iCon )
+    {
+        bc->m_constraintIndices[ iCon ] = iCon;
+    }
+    if ( numConstraints <= 0 )
+        return;  // no constraints: the tables stay empty
+    bc->m_batches.push_back( Range( 0, numConstraints ) );
+    bc->m_phases.push_back( Range( 0, 1 ) );
+    bc->m_phaseOrder.push_back( 0 );
+    bc->m_phaseGrainSize.push_back( 1 );
+}
+
+
+void btBatchedConstraints::setup(
+    btConstraintArray* constraints,
+    const btAlignedObjectArray<btSolverBody>& bodies,
+    BatchingMethod batchingMethod,
+    int minBatchSize,
+    int maxBatchSize,
+    btAlignedObjectArray<char>* scratchMemory
+    )
+{
+    // with fewer than 4*minBatchSize constraints everything goes into a single batch
+    if ( constraints->size() < minBatchSize * 4 )
+    {
+        setupSingleBatch( this, constraints->size() );
+        return;
+    }
+    // otherwise build spatial grid batches; the 2D method collapses one grid axis
+    const bool use2DGrid = ( batchingMethod == BatchingMethod::BATCHING_METHOD_SPATIAL_GRID_2D );
+    setupSpatialGridBatchesMt( this, scratchMemory, constraints, bodies, minBatchSize, maxBatchSize, use2DGrid );
+    if ( s_debugDrawBatches )
+    {
+        debugDrawAllBatches( this, constraints, bodies );
+    }
+}
+
+
diff --git a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.h b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.h
new file mode 100644
index 000000000..0fd8f31dd
--- /dev/null
+++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.h
@@ -0,0 +1,66 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_BATCHED_CONSTRAINTS_H
+#define BT_BATCHED_CONSTRAINTS_H
+
+#include "LinearMath/btThreads.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "BulletDynamics/ConstraintSolver/btSolverBody.h"
+#include "BulletDynamics/ConstraintSolver/btSolverConstraint.h"
+
+
+class btIDebugDraw;
+
+struct btBatchedConstraints
+{
+    enum BatchingMethod
+    {
+        BATCHING_METHOD_SPATIAL_GRID_2D,  // spatial grid with one axis collapsed
+        BATCHING_METHOD_SPATIAL_GRID_3D,  // full 3D spatial grid
+        BATCHING_METHOD_COUNT
+    };
+    struct Range  // half-open index interval [begin, end)
+    {
+        int begin;
+        int end;
+
+        Range() : begin( 0 ), end( 0 ) {}
+        Range( int rangeBegin, int rangeEnd ) : begin( rangeBegin ), end( rangeEnd ) {}
+    };
+
+    btAlignedObjectArray<int> m_constraintIndices;  // constraint indices, grouped so that each batch is contiguous
+    btAlignedObjectArray<Range> m_batches;  // each batch is a range of indices in the m_constraintIndices array
+    btAlignedObjectArray<Range> m_phases;  // each phase is range of indices in the m_batches array
+    btAlignedObjectArray<char> m_phaseGrainSize;  // max grain size for each phase
+    btAlignedObjectArray<int> m_phaseOrder;  // phases can be done in any order, so we can randomize the order here
+    btIDebugDraw* m_debugDrawer;  // NULL until assigned by the user of this struct
+
+    static bool s_debugDrawBatches;  // when true, setup() debug-draws the batches produced by the spatial grid method
+
+    btBatchedConstraints() : m_debugDrawer( NULL ) {}
+    void setup( btConstraintArray* constraints,
+        const btAlignedObjectArray<btSolverBody>& bodies,
+        BatchingMethod batchingMethod,
+        int minBatchSize,
+        int maxBatchSize,
+        btAlignedObjectArray<char>* scratchMemory
+    );
+    bool validate( btConstraintArray* constraints, const btAlignedObjectArray<btSolverBody>& bodies ) const;
+};
+
+
+#endif // BT_BATCHED_CONSTRAINTS_H
+
diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
index 1b2f211a1..c2a23dfb2 100644
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
@@ -1258,6 +1258,256 @@ void btSequentialImpulseConstraintSolver::convertContacts(btPersistentManifold**
 	}
 }
 
+
+void btSequentialImpulseConstraintSolver::convertJoint(btSolverConstraint* currentConstraintRow,  // first of info1.m_numConstraintRows consecutive rows to fill
+    btTypedConstraint* constraint,  // the joint being converted
+    const btTypedConstraint::btConstraintInfo1& info1,  // row count for this joint (convertJoints obtains it from getInfo1)
+    int solverBodyIdA,  // index of body A in m_tmpSolverBodyPool
+    int solverBodyIdB,  // index of body B in m_tmpSolverBodyPool
+    const btContactSolverInfo& infoGlobal  // global settings read here: time step, erp, cfm, damping, iteration count
+    )
+{  // Sets up all solver rows of a single joint: defaults, getInfo2, then per-row limits, effective mass and rhs.
+	const btRigidBody& rbA = constraint->getRigidBodyA();
+	const btRigidBody& rbB = constraint->getRigidBodyB();
+
+    const btSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA];
+    const btSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB];
+
+	int overrideNumSolverIterations = constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;  // per-joint override if > 0, else the global count
+	if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)  // keep the running maximum in the member
+		m_maxOverrideNumSolverIterations = overrideNumSolverIterations;
+
+	for (int j=0;j<info1.m_numConstraintRows;j++)  // reset every row of this joint to defaults
+	{
+		memset(&currentConstraintRow[j],0,sizeof(btSolverConstraint));  // zero the whole row first
+		currentConstraintRow[j].m_lowerLimit = -SIMD_INFINITY;  // unbounded until getInfo2 / the breaking threshold narrow it
+		currentConstraintRow[j].m_upperLimit = SIMD_INFINITY;
+		currentConstraintRow[j].m_appliedImpulse = 0.f;
+		currentConstraintRow[j].m_appliedPushImpulse = 0.f;
+		currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;
+		currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;
+		currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;
+	}
+
+    // these vectors are already cleared in initSolverBody, no need to redundantly clear again
+    btAssert(bodyAPtr->getDeltaLinearVelocity().isZero());
+    btAssert(bodyAPtr->getDeltaAngularVelocity().isZero());
+    btAssert(bodyAPtr->getPushVelocity().isZero());
+    btAssert(bodyAPtr->getTurnVelocity().isZero());
+    btAssert(bodyBPtr->getDeltaLinearVelocity().isZero());
+    btAssert(bodyBPtr->getDeltaAngularVelocity().isZero());
+    btAssert(bodyBPtr->getPushVelocity().isZero());
+    btAssert(bodyBPtr->getTurnVelocity().isZero());
+	//bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
+	//bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
+	//bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
+	//bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
+	//bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
+	//bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
+	//bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
+	//bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
+
+
+	btTypedConstraint::btConstraintInfo2 info2;  // its pointers below all refer to fields of the first row
+	info2.fps = 1.f/infoGlobal.m_timeStep;
+	info2.erp = infoGlobal.m_erp;
+	info2.m_J1linearAxis = currentConstraintRow->m_contactNormal1;
+	info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal;
+	info2.m_J2linearAxis = currentConstraintRow->m_contactNormal2;
+	info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal;
+	info2.rowskip = sizeof(btSolverConstraint)/sizeof(btScalar);//check this
+	///the size of btSolverConstraint needs be a multiple of btScalar
+	btAssert(info2.rowskip*sizeof(btScalar)== sizeof(btSolverConstraint));
+	info2.m_constraintError = &currentConstraintRow->m_rhs;
+	currentConstraintRow->m_cfm = infoGlobal.m_globalCfm;  // seed the first row's cfm before getInfo2 sees it
+	info2.m_damping = infoGlobal.m_damping;
+	info2.cfm = &currentConstraintRow->m_cfm;
+	info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;
+	info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
+	info2.m_numIterations = infoGlobal.m_numIterations;
+	constraint->getInfo2(&info2);  // the constraint fills in the rows through the pointers set above
+
+	///finalize the constraint setup
+	for (int j=0;j<info1.m_numConstraintRows;j++)
+	{
+		btSolverConstraint& solverConstraint = currentConstraintRow[j];
+
+		if (solverConstraint.m_upperLimit>=constraint->getBreakingImpulseThreshold())  // clamp upper limit to the breaking impulse threshold
+		{
+			solverConstraint.m_upperLimit = constraint->getBreakingImpulseThreshold();
+		}
+
+		if (solverConstraint.m_lowerLimit<=-constraint->getBreakingImpulseThreshold())  // clamp lower limit to minus the threshold
+		{
+			solverConstraint.m_lowerLimit = -constraint->getBreakingImpulseThreshold();
+		}
+
+		solverConstraint.m_originalContactPoint = constraint;  // for joint rows this field holds the btTypedConstraint pointer
+
+		{
+			const btVector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal;
+			solverConstraint.m_angularComponentA = constraint->getRigidBodyA().getInvInertiaTensorWorld()*ftorqueAxis1*constraint->getRigidBodyA().getAngularFactor();
+		}
+		{
+			const btVector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal;
+			solverConstraint.m_angularComponentB = constraint->getRigidBodyB().getInvInertiaTensorWorld()*ftorqueAxis2*constraint->getRigidBodyB().getAngularFactor();
+		}
+
+		{  // m_jacDiagABInv: reciprocal of the sum of the four inverse-mass / inverse-inertia weighted Jacobian terms
+			btVector3 iMJlA = solverConstraint.m_contactNormal1*rbA.getInvMass();
+			btVector3 iMJaA = rbA.getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal;
+			btVector3 iMJlB = solverConstraint.m_contactNormal2*rbB.getInvMass();//sign of normal?
+			btVector3 iMJaB = rbB.getInvInertiaTensorWorld()*solverConstraint.m_relpos2CrossNormal;
+
+			btScalar sum = iMJlA.dot(solverConstraint.m_contactNormal1);
+			sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
+			sum += iMJlB.dot(solverConstraint.m_contactNormal2);
+			sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
+			btScalar fsum = btFabs(sum);
+			btAssert(fsum > SIMD_EPSILON);
+			btScalar sorRelaxation = 1.f;//todo: get from globalInfo?
+			solverConstraint.m_jacDiagABInv = fsum>SIMD_EPSILON?sorRelaxation/sum : 0.f;  // falls back to 0 when |sum| <= SIMD_EPSILON
+		}
+
+		{  // m_rhs: turn the positional error from getInfo2 plus the damped relative velocity into an impulse
+			btScalar rel_vel;
+			btVector3 externalForceImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalForceImpulse : btVector3(0,0,0);  // zero when the solver body has no original body
+			btVector3 externalTorqueImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalTorqueImpulse : btVector3(0,0,0);
+
+			btVector3 externalForceImpulseB = bodyBPtr->m_originalBody ? bodyBPtr->m_externalForceImpulse : btVector3(0,0,0);
+			btVector3 externalTorqueImpulseB = bodyBPtr->m_originalBody ?bodyBPtr->m_externalTorqueImpulse : btVector3(0,0,0);
+
+			btScalar vel1Dotn = solverConstraint.m_contactNormal1.dot(rbA.getLinearVelocity()+externalForceImpulseA)
+								+ solverConstraint.m_relpos1CrossNormal.dot(rbA.getAngularVelocity()+externalTorqueImpulseA);
+
+			btScalar vel2Dotn = solverConstraint.m_contactNormal2.dot(rbB.getLinearVelocity()+externalForceImpulseB)
+												+ solverConstraint.m_relpos2CrossNormal.dot(rbB.getAngularVelocity()+externalTorqueImpulseB);
+
+			rel_vel = vel1Dotn+vel2Dotn;
+			btScalar restitution = 0.f;
+			btScalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2
+			btScalar	velocityError = restitution - rel_vel * info2.m_damping;
+			btScalar	penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv;
+			btScalar	velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv;
+			solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;  // overwrites the raw error with the impulse-space value
+			solverConstraint.m_appliedImpulse = 0.f;
+		}
+	}
+}
+
+
+void btSequentialImpulseConstraintSolver::convertJoints(btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal)
+{  // Builds the rows of m_tmpSolverNonContactConstraintPool for all joints in three passes over constraints[0..numConstraints).
+    BT_PROFILE("convertJoints");
+	for (int j=0;j<numConstraints;j++)  // pass 1: runs for every constraint, enabled or not
+	{
+		btTypedConstraint* constraint = constraints[j];
+		constraint->buildJacobian();
+		constraint->internalSetAppliedImpulse(0.0f);
+	}
+
+	int totalNumRows = 0;
+
+	m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);  // one info1 slot per constraint, filled in below
+	//calculate the total number of constraint rows
+	for (int i=0;i<numConstraints;i++)  // pass 2: query row counts and clear joint feedback
+	{
+		btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+		btJointFeedback* fb = constraints[i]->getJointFeedback();
+		if (fb)  // feedback is optional; zero it when present
+		{
+			fb->m_appliedForceBodyA.setZero();
+			fb->m_appliedTorqueBodyA.setZero();
+			fb->m_appliedForceBodyB.setZero();
+			fb->m_appliedTorqueBodyB.setZero();
+		}
+
+		if (constraints[i]->isEnabled())
+		{
+			constraints[i]->getInfo1(&info1);
+		} else
+		{
+			info1.m_numConstraintRows = 0;  // disabled constraints contribute no rows
+			info1.nub = 0;
+		}
+		totalNumRows += info1.m_numConstraintRows;
+	}
+	m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows);  // rows are not initialized here; convertJoint() memsets each one
+
+
+	///setup the btSolverConstraints
+	int currentRow = 0;  // index of the first row belonging to constraint i
+
+	for (int i=0;i<numConstraints;i++)  // pass 3: fill the rows of every constraint that has any
+	{
+		const btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+
+		if (info1.m_numConstraintRows)
+		{
+			btAssert(currentRow<totalNumRows);
+
+			btSolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[currentRow];
+			btTypedConstraint* constraint = constraints[i];
+			btRigidBody& rbA = constraint->getRigidBodyA();
+			btRigidBody& rbB = constraint->getRigidBodyB();
+
+			int solverBodyIdA = getOrInitSolverBody(rbA,infoGlobal.m_timeStep);
+            int solverBodyIdB = getOrInitSolverBody(rbB,infoGlobal.m_timeStep);
+
+            convertJoint(currentConstraintRow, constraint, info1, solverBodyIdA, solverBodyIdB, infoGlobal);
+        }
+		currentRow+=info1.m_numConstraintRows;  // adds 0 for constraints that were skipped
+	}
+}
+
+
+void btSequentialImpulseConstraintSolver::convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal)
+{  // Resets the solver body pool, obtains a solver body id for every body, and adds gyroscopic terms to m_externalTorqueImpulse.
+    BT_PROFILE("convertBodies");
+	for (int i = 0; i < numBodies; i++)
+	{
+		bodies[i]->setCompanionId(-1);  // NOTE(review): presumably -1 means "no solver body assigned yet" -- confirm in getOrInitSolverBody
+	}
+#if BT_THREADSAFE
+    m_kinematicBodyUniqueIdToSolverBodyTable.resize( 0 );
+#endif // BT_THREADSAFE
+
+	m_tmpSolverBodyPool.reserve(numBodies+1);  // NOTE(review): the +1 is presumably room for the shared fixed body -- confirm
+	m_tmpSolverBodyPool.resize(0);  // start from an empty pool
+
+	//btSolverBody& fixedBody = m_tmpSolverBodyPool.expand();
+    //initSolverBody(&fixedBody,0);
+
+    for (int i=0;i<numBodies;i++)
+	{
+		int bodyId = getOrInitSolverBody(*bodies[i],infoGlobal.m_timeStep);
+
+		btRigidBody* body = btRigidBody::upcast(bodies[i]);
+		if (body && body->getInvMass())  // only rigid bodies with non-zero inverse mass get gyroscopic terms
+		{
+			btSolverBody& solverBody = m_tmpSolverBodyPool[bodyId];
+			btVector3 gyroForce (0,0,0);
+			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_EXPLICIT)  // the three flag checks are independent; each set flag contributes
+			{
+				gyroForce = body->computeGyroscopicForceExplicit(infoGlobal.m_maxGyroscopicForce);
+				solverBody.m_externalTorqueImpulse -= gyroForce*body->getInvInertiaTensorWorld()*infoGlobal.m_timeStep;  // explicit result is subtracted, scaled by inverse inertia and time step
+			}
+			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_WORLD)
+			{
+				gyroForce = body->computeGyroscopicImpulseImplicit_World(infoGlobal.m_timeStep);
+				solverBody.m_externalTorqueImpulse += gyroForce;  // implicit variants are added unscaled
+			}
+			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_BODY)
+			{
+				gyroForce = body->computeGyroscopicImpulseImplicit_Body(infoGlobal.m_timeStep);
+				solverBody.m_externalTorqueImpulse += gyroForce;
+
+			}
+		}
+	}
+}
+
+
 btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
 {
 	m_fixedBodyId = -1;
@@ -1344,250 +1594,13 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 #endif //BT_ADDITIONAL_DEBUG
 
 
-	for (int i = 0; i < numBodies; i++)
-	{
-		bodies[i]->setCompanionId(-1);
-	}
-#if BT_THREADSAFE
-    m_kinematicBodyUniqueIdToSolverBodyTable.resize( 0 );
-#endif // BT_THREADSAFE
-
-	m_tmpSolverBodyPool.reserve(numBodies+1);
-	m_tmpSolverBodyPool.resize(0);
-
-	//btSolverBody& fixedBody = m_tmpSolverBodyPool.expand();
-    //initSolverBody(&fixedBody,0);
-
 	//convert all bodies
+    convertBodies(bodies, numBodies, infoGlobal);
 
+    convertJoints(constraints, numConstraints, infoGlobal);
 
-	for (int i=0;i<numBodies;i++)
-	{
-		int bodyId = getOrInitSolverBody(*bodies[i],infoGlobal.m_timeStep);
+	convertContacts(manifoldPtr,numManifolds,infoGlobal);
 
-		btRigidBody* body = btRigidBody::upcast(bodies[i]);
-		if (body && body->getInvMass())
-		{
-			btSolverBody& solverBody = m_tmpSolverBodyPool[bodyId];
-			btVector3 gyroForce (0,0,0);
-			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_EXPLICIT)
-			{
-				gyroForce = body->computeGyroscopicForceExplicit(infoGlobal.m_maxGyroscopicForce);
-				solverBody.m_externalTorqueImpulse -= gyroForce*body->getInvInertiaTensorWorld()*infoGlobal.m_timeStep;
-			}
-			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_WORLD)
-			{
-				gyroForce = body->computeGyroscopicImpulseImplicit_World(infoGlobal.m_timeStep);
-				solverBody.m_externalTorqueImpulse += gyroForce;
-			}
-			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_BODY)
-			{
-				gyroForce = body->computeGyroscopicImpulseImplicit_Body(infoGlobal.m_timeStep);
-				solverBody.m_externalTorqueImpulse += gyroForce;
-
-			}
-			
-
-		}
-	}
-
-	if (1)
-	{
-		int j;
-		for (j=0;j<numConstraints;j++)
-		{
-			btTypedConstraint* constraint = constraints[j];
-			constraint->buildJacobian();
-			constraint->internalSetAppliedImpulse(0.0f);
-		}
-	}
-
-	//btRigidBody* rb0=0,*rb1=0;
-
-	//if (1)
-	{
-		{
-
-			int totalNumRows = 0;
-			int i;
-
-			m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);
-			//calculate the total number of contraint rows
-			for (i=0;i<numConstraints;i++)
-			{
-				btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
-				btJointFeedback* fb = constraints[i]->getJointFeedback();
-				if (fb)
-				{
-					fb->m_appliedForceBodyA.setZero();
-					fb->m_appliedTorqueBodyA.setZero();
-					fb->m_appliedForceBodyB.setZero();
-					fb->m_appliedTorqueBodyB.setZero();
-				}
-
-				if (constraints[i]->isEnabled())
-				{
-					constraints[i]->getInfo1(&info1);
-				} else
-				{
-					info1.m_numConstraintRows = 0;
-					info1.nub = 0;
-				}
-				totalNumRows += info1.m_numConstraintRows;
-			}
-			m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows);
-
-
-			///setup the btSolverConstraints
-			int currentRow = 0;
-
-			for (i=0;i<numConstraints;i++)
-			{
-				const btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
-
-				if (info1.m_numConstraintRows)
-				{
-					btAssert(currentRow<totalNumRows);
-
-					btSolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[currentRow];
-					btTypedConstraint* constraint = constraints[i];
-					btRigidBody& rbA = constraint->getRigidBodyA();
-					btRigidBody& rbB = constraint->getRigidBodyB();
-
-					int solverBodyIdA = getOrInitSolverBody(rbA,infoGlobal.m_timeStep);
-                    int solverBodyIdB = getOrInitSolverBody(rbB,infoGlobal.m_timeStep);
-
-                    btSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA];
-                    btSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB];
-
-
-
-
-					int overrideNumSolverIterations = constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;
-					if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)
-						m_maxOverrideNumSolverIterations = overrideNumSolverIterations;
-
-
-					int j;
-					for ( j=0;j<info1.m_numConstraintRows;j++)
-					{
-						memset(&currentConstraintRow[j],0,sizeof(btSolverConstraint));
-						currentConstraintRow[j].m_lowerLimit = -SIMD_INFINITY;
-						currentConstraintRow[j].m_upperLimit = SIMD_INFINITY;
-						currentConstraintRow[j].m_appliedImpulse = 0.f;
-						currentConstraintRow[j].m_appliedPushImpulse = 0.f;
-						currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;
-						currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;
-						currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;
-					}
-
-					bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
-					bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
-					bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
-					bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
-					bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
-					bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
-					bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
-					bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
-
-
-					btTypedConstraint::btConstraintInfo2 info2;
-					info2.fps = 1.f/infoGlobal.m_timeStep;
-					info2.erp = infoGlobal.m_erp;
-					info2.m_J1linearAxis = currentConstraintRow->m_contactNormal1;
-					info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal;
-					info2.m_J2linearAxis = currentConstraintRow->m_contactNormal2;
-					info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal;
-					info2.rowskip = sizeof(btSolverConstraint)/sizeof(btScalar);//check this
-					///the size of btSolverConstraint needs be a multiple of btScalar
-		            btAssert(info2.rowskip*sizeof(btScalar)== sizeof(btSolverConstraint));
-					info2.m_constraintError = &currentConstraintRow->m_rhs;
-					currentConstraintRow->m_cfm = infoGlobal.m_globalCfm;
-					info2.m_damping = infoGlobal.m_damping;
-					info2.cfm = &currentConstraintRow->m_cfm;
-					info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;
-					info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
-					info2.m_numIterations = infoGlobal.m_numIterations;
-					constraints[i]->getInfo2(&info2);
-
-					///finalize the constraint setup
-					for ( j=0;j<info1.m_numConstraintRows;j++)
-					{
-						btSolverConstraint& solverConstraint = currentConstraintRow[j];
-
-						if (solverConstraint.m_upperLimit>=constraints[i]->getBreakingImpulseThreshold())
-						{
-							solverConstraint.m_upperLimit = constraints[i]->getBreakingImpulseThreshold();
-						}
-
-						if (solverConstraint.m_lowerLimit<=-constraints[i]->getBreakingImpulseThreshold())
-						{
-							solverConstraint.m_lowerLimit = -constraints[i]->getBreakingImpulseThreshold();
-						}
-
-						solverConstraint.m_originalContactPoint = constraint;
-
-						{
-							const btVector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal;
-							solverConstraint.m_angularComponentA = constraint->getRigidBodyA().getInvInertiaTensorWorld()*ftorqueAxis1*constraint->getRigidBodyA().getAngularFactor();
-						}
-						{
-							const btVector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal;
-							solverConstraint.m_angularComponentB = constraint->getRigidBodyB().getInvInertiaTensorWorld()*ftorqueAxis2*constraint->getRigidBodyB().getAngularFactor();
-						}
-
-						{
-							btVector3 iMJlA = solverConstraint.m_contactNormal1*rbA.getInvMass();
-							btVector3 iMJaA = rbA.getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal;
-							btVector3 iMJlB = solverConstraint.m_contactNormal2*rbB.getInvMass();//sign of normal?
-							btVector3 iMJaB = rbB.getInvInertiaTensorWorld()*solverConstraint.m_relpos2CrossNormal;
-
-							btScalar sum = iMJlA.dot(solverConstraint.m_contactNormal1);
-							sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
-							sum += iMJlB.dot(solverConstraint.m_contactNormal2);
-							sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
-							btScalar fsum = btFabs(sum);
-							btAssert(fsum > SIMD_EPSILON);
-							btScalar sorRelaxation = 1.f;//todo: get from globalInfo?
-							solverConstraint.m_jacDiagABInv = fsum>SIMD_EPSILON?sorRelaxation/sum : 0.f;
-						}
-
-
-
-						{
-							btScalar rel_vel;
-							btVector3 externalForceImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalForceImpulse : btVector3(0,0,0);
-							btVector3 externalTorqueImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalTorqueImpulse : btVector3(0,0,0);
-
-							btVector3 externalForceImpulseB = bodyBPtr->m_originalBody ? bodyBPtr->m_externalForceImpulse : btVector3(0,0,0);
-							btVector3 externalTorqueImpulseB = bodyBPtr->m_originalBody ?bodyBPtr->m_externalTorqueImpulse : btVector3(0,0,0);
-
-							btScalar vel1Dotn = solverConstraint.m_contactNormal1.dot(rbA.getLinearVelocity()+externalForceImpulseA)
-												+ solverConstraint.m_relpos1CrossNormal.dot(rbA.getAngularVelocity()+externalTorqueImpulseA);
-
-							btScalar vel2Dotn = solverConstraint.m_contactNormal2.dot(rbB.getLinearVelocity()+externalForceImpulseB)
-																+ solverConstraint.m_relpos2CrossNormal.dot(rbB.getAngularVelocity()+externalTorqueImpulseB);
-
-							rel_vel = vel1Dotn+vel2Dotn;
-							btScalar restitution = 0.f;
-							btScalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2
-							btScalar	velocityError = restitution - rel_vel * info2.m_damping;
-							btScalar	penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv;
-							btScalar	velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv;
-							solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
-							solverConstraint.m_appliedImpulse = 0.f;
-
-
-						}
-					}
-				}
-				currentRow+=m_tmpConstraintSizesPool[i].m_numConstraintRows;
-			}
-		}
-
-		convertContacts(manifoldPtr,numManifolds,infoGlobal);
-
-	}
 
 //	btContactSolverInfo info = infoGlobal;
 
@@ -1627,6 +1640,7 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 
 btScalar btSequentialImpulseConstraintSolver::solveSingleIteration(int iteration, btCollisionObject** /*bodies */,int /*numBodies*/,btPersistentManifold** /*manifoldPtr*/, int /*numManifolds*/,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* /*debugDrawer*/)
 {
+    BT_PROFILE("solveSingleIteration");
 	btScalar leastSquaresResidual = 0.f;
 
 	int numNonContactPool = m_tmpSolverNonContactConstraintPool.size();
@@ -1805,6 +1819,7 @@ btScalar btSequentialImpulseConstraintSolver::solveSingleIteration(int iteration
 
 void btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
 {
+	BT_PROFILE("solveGroupCacheFriendlySplitImpulseIterations");
 	int iteration;
 	if (infoGlobal.m_splitImpulse)
 	{
@@ -1863,14 +1878,9 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyIterations(
 	return 0.f;
 }
 
-btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal)
+void btSequentialImpulseConstraintSolver::writeBackContacts(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal)
 {
-	int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
-	int i,j;
-
-	if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
-	{
-		for (j=0;j<numPoolConstraints;j++)
+		for (int j=iBegin; j<iEnd; j++)
 		{
 			const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[j];
 			btManifoldPoint* pt = (btManifoldPoint*) solveManifold.m_originalContactPoint;
@@ -1886,10 +1896,11 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCo
 			}
 			//do a callback here?
 		}
-	}
+}
 
-	numPoolConstraints = m_tmpSolverNonContactConstraintPool.size();
-	for (j=0;j<numPoolConstraints;j++)
+void btSequentialImpulseConstraintSolver::writeBackJoints(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal)
+{
+	for (int j=iBegin; j<iEnd; j++)
 	{
 		const btSolverConstraint& solverConstr = m_tmpSolverNonContactConstraintPool[j];
 		btTypedConstraint* constr = (btTypedConstraint*)solverConstr.m_originalContactPoint;
@@ -1909,10 +1920,12 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCo
 			constr->setEnabled(false);
 		}
 	}
+}
 
 
-
-	for ( i=0;i<m_tmpSolverBodyPool.size();i++)
+void btSequentialImpulseConstraintSolver::writeBackBodies(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal)
+{
+	for (int i=iBegin; i<iEnd; i++)
 	{
 		btRigidBody* body = m_tmpSolverBodyPool[i].m_originalBody;
 		if (body)
@@ -1936,6 +1949,19 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCo
 			m_tmpSolverBodyPool[i].m_originalBody->setCompanionId(-1);
 		}
 	}
+}
+
+btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal)
+{
+	BT_PROFILE("solveGroupCacheFriendlyFinish");
+
+	if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
+	{
+        writeBackContacts(0, m_tmpSolverContactConstraintPool.size(), infoGlobal);
+	}
+
+    writeBackJoints(0, m_tmpSolverNonContactConstraintPool.size(), infoGlobal);
+    writeBackBodies(0, m_tmpSolverBodyPool.size(), infoGlobal);
 
 	m_tmpSolverContactConstraintPool.resizeNoInitialize(0);
 	m_tmpSolverNonContactConstraintPool.resizeNoInitialize(0);
diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
index 16c7eb74c..8c9c67f85 100644
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
@@ -95,6 +95,10 @@ protected:
 
 	void	convertContact(btPersistentManifold* manifold,const btContactSolverInfo& infoGlobal);
 
+    virtual void convertJoints(btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal);  // fills m_tmpSolverNonContactConstraintPool for all joints; virtual so it can be overridden
+    void convertJoint(btSolverConstraint* destConstraintRow, btTypedConstraint* srcConstraint, const btTypedConstraint::btConstraintInfo1& info1, int solverBodyIdA, int solverBodyIdB, const btContactSolverInfo& infoGlobal);  // sets up the info1.m_numConstraintRows rows of one joint, starting at destConstraintRow
+
+    virtual void convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal);  // resets m_tmpSolverBodyPool and obtains a solver body id for each body; virtual so it can be overridden
 
 	btSimdScalar	resolveSplitPenetrationSIMD(btSolverBody& bodyA,btSolverBody& bodyB, const btSolverConstraint& contactConstraint)
     {
@@ -121,7 +125,9 @@ protected:
 		
 protected:
 	
-	
+    void writeBackContacts(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);  // processes entries [iBegin, iEnd) of m_tmpSolverContactConstraintPool
+    void writeBackJoints(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);  // processes entries [iBegin, iEnd) of m_tmpSolverNonContactConstraintPool
+    void writeBackBodies(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);  // processes entries [iBegin, iEnd) of m_tmpSolverBodyPool
 	virtual void solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
 	virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal);
 	virtual btScalar solveSingleIteration(int iteration, btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
new file mode 100644
index 000000000..b09665b15
--- /dev/null
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
@@ -0,0 +1,1611 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "btSequentialImpulseConstraintSolverMt.h"
+
+#include "LinearMath/btQuickprof.h"
+
+#include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
+
+#include "BulletDynamics/ConstraintSolver/btTypedConstraint.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+
+// Default values of the solver's static (class-wide) settings.
+// The batch sizes and batching methods are forwarded to btBatchedConstraints::setup() further down in this file.
+bool btSequentialImpulseConstraintSolverMt::s_allowNestedParallelForLoops = false;  // some task schedulers don't like nested loops
+int btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching = 250;  // NOTE(review): presumably the manifold count needed before batching is used -- usage not visible here
+int btSequentialImpulseConstraintSolverMt::s_minBatchSize = 50;
+int btSequentialImpulseConstraintSolverMt::s_maxBatchSize = 100;
+btBatchedConstraints::BatchingMethod btSequentialImpulseConstraintSolverMt::s_contactBatchingMethod = btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_2D;
+btBatchedConstraints::BatchingMethod btSequentialImpulseConstraintSolverMt::s_jointBatchingMethod = btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_2D;
+// Constructs the solver with one friction direction per contact, batching turned off
+// and m_useObsoleteJointConstraints cleared.
+btSequentialImpulseConstraintSolverMt::btSequentialImpulseConstraintSolverMt()
+{
+    m_numFrictionDirections = 1;
+    m_useBatching = false;
+    m_useObsoleteJointConstraints = false;
+}
+
+// Destructor: the body is empty, nothing is released explicitly here.
+btSequentialImpulseConstraintSolverMt::~btSequentialImpulseConstraintSolverMt()
+{
+}
+// Builds m_batchedContactConstraints from the contact constraint pool and the solver bodies,
+// using the static contact batching method and the shared min/max batch sizes.
+void btSequentialImpulseConstraintSolverMt::setupBatchedContactConstraints()
+{
+    BT_PROFILE("setupBatchedContactConstraints");
+    m_batchedContactConstraints.setup( &m_tmpSolverContactConstraintPool,
+        m_tmpSolverBodyPool,
+        s_contactBatchingMethod,
+        s_minBatchSize,
+        s_maxBatchSize,
+        &m_scratchMemory
+    );
+}
+// Builds m_batchedJointConstraints from the non-contact (joint) constraint pool and the solver
+// bodies, using the static joint batching method and the shared min/max batch sizes.
+void btSequentialImpulseConstraintSolverMt::setupBatchedJointConstraints()
+{
+    BT_PROFILE("setupBatchedJointConstraints");
+    m_batchedJointConstraints.setup( &m_tmpSolverNonContactConstraintPool,
+        m_tmpSolverBodyPool,
+        s_jointBatchingMethod,
+        s_minBatchSize,
+        s_maxBatchSize,
+        &m_scratchMemory
+    );
+}
+// Sets up one contact: the normal constraint at index iContactConstraint, its spinning/rolling friction
+// rows (when the index table holds an entry for it) and its 1 or 2 lateral friction rows. Slots must already exist.
+void btSequentialImpulseConstraintSolverMt::internalSetupContactConstraints(int iContactConstraint, const btContactSolverInfo& infoGlobal)
+{
+    btSolverConstraint& contactConstraint = m_tmpSolverContactConstraintPool[iContactConstraint];
+
+    btVector3 rel_pos1;
+    btVector3 rel_pos2;
+    btScalar relaxation;
+
+    int solverBodyIdA = contactConstraint.m_solverBodyIdA;
+    int solverBodyIdB = contactConstraint.m_solverBodyIdB;
+
+    btSolverBody* solverBodyA = &m_tmpSolverBodyPool[ solverBodyIdA ];
+    btSolverBody* solverBodyB = &m_tmpSolverBodyPool[ solverBodyIdB ];
+
+    btRigidBody* colObj0 = solverBodyA->m_originalBody;
+    btRigidBody* colObj1 = solverBodyB->m_originalBody;
+
+    btManifoldPoint& cp = *static_cast<btManifoldPoint*>( contactConstraint.m_originalContactPoint );
+
+    const btVector3& pos1 = cp.getPositionWorldOnA();
+    const btVector3& pos2 = cp.getPositionWorldOnB();
+
+    rel_pos1 = pos1 - solverBodyA->getWorldTransform().getOrigin();
+    rel_pos2 = pos2 - solverBodyB->getWorldTransform().getOrigin();
+
+    btVector3 vel1;
+    btVector3 vel2;
+
+    solverBodyA->getVelocityInLocalPointNoDelta( rel_pos1, vel1 );
+    solverBodyB->getVelocityInLocalPointNoDelta( rel_pos2, vel2 );
+
+    btVector3 vel = vel1 - vel2;
+    btScalar rel_vel = cp.m_normalWorldOnB.dot( vel );
+
+    setupContactConstraint( contactConstraint, solverBodyIdA, solverBodyIdB, cp, infoGlobal, relaxation, rel_pos1, rel_pos2 );
+
+    // setup rolling friction constraints
+    int rollingFrictionIndex = m_rollingFrictionIndexTable[iContactConstraint];
+    if (rollingFrictionIndex >= 0)
+    {
+        btSolverConstraint& spinningFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[ rollingFrictionIndex ];
+        btAssert( spinningFrictionConstraint.m_frictionIndex == iContactConstraint );
+        setupTorsionalFrictionConstraint( spinningFrictionConstraint,
+            cp.m_normalWorldOnB,
+            solverBodyIdA,
+            solverBodyIdB,
+            cp,
+            cp.m_combinedSpinningFriction,
+            rel_pos1,
+            rel_pos2,
+            colObj0,
+            colObj1,
+            relaxation,
+            0.0f,
+            0.0f
+        );
+        btVector3 axis[2];
+        btPlaneSpace1( cp.m_normalWorldOnB, axis[0], axis[1] );
+        axis[0].normalize();
+        axis[1].normalize();
+
+        applyAnisotropicFriction( colObj0, axis[0], btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION );
+        applyAnisotropicFriction( colObj1, axis[0], btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION );
+        applyAnisotropicFriction( colObj0, axis[1], btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION );
+        applyAnisotropicFriction( colObj1, axis[1], btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION );
+        // put the largest axis first
+        if (axis[1].length2() > axis[0].length2())
+        {
+            btSwap(axis[0], axis[1]);
+        }
+        const btScalar kRollingFrictionThreshold = 0.001f;
+        for (int i = 0; i < 2; ++i)
+        {
+            int iRollingFric = rollingFrictionIndex + 1 + i;
+            btSolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[ iRollingFric ];
+            btAssert(rollingFrictionConstraint.m_frictionIndex == iContactConstraint);
+            btVector3 dir = axis[i];
+            if ( dir.length() > kRollingFrictionThreshold )
+            {
+                setupTorsionalFrictionConstraint( rollingFrictionConstraint,
+                    dir,
+                    solverBodyIdA,
+                    solverBodyIdB,
+                    cp,
+                    cp.m_combinedRollingFriction,
+                    rel_pos1,
+                    rel_pos2,
+                    colObj0,
+                    colObj1,
+                    relaxation,
+                    0.0f,
+                    0.0f
+                );
+            }
+            else
+            {
+                rollingFrictionConstraint.m_frictionIndex = -1;  // disable constraint
+            }
+        }
+    }
+
+    // setup friction constraints
+    //	setupFrictionConstraint(solverConstraint, normalAxis, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, infoGlobal, desiredVelocity, cfmSlip);
+    {
+        ///Bullet has several options to set the friction directions
+        ///By default, each contact has only a single friction direction that is recomputed automatically every frame
+        ///based on the relative linear velocity.
+        ///If the relative velocity is zero, it will automatically compute a friction direction.
+
+        ///You can also enable two friction directions, using the SOLVER_USE_2_FRICTION_DIRECTIONS.
+        ///In that case, the second friction direction will be orthogonal to both contact normal and first friction direction.
+        ///
+        ///If you choose SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION, then the friction will be independent from the relative projected velocity.
+        ///
+        ///The user can manually override the friction directions for certain contacts using a contact callback,
+        ///and set the cp.m_lateralFrictionInitialized to true
+        ///In that case, you can set the target relative motion in each friction direction (cp.m_contactMotion1 and cp.m_contactMotion2)
+        ///this will give a conveyor belt effect
+        ///
+	    btSolverConstraint* frictionConstraint1 = &m_tmpSolverContactFrictionConstraintPool[contactConstraint.m_frictionIndex];
+        btAssert(frictionConstraint1->m_frictionIndex == iContactConstraint);
+
+        btSolverConstraint* frictionConstraint2 = NULL;
+        if ( infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS )
+        {
+            frictionConstraint2 = &m_tmpSolverContactFrictionConstraintPool[contactConstraint.m_frictionIndex + 1];
+            btAssert( frictionConstraint2->m_frictionIndex == iContactConstraint );
+        }
+
+        if ( !( infoGlobal.m_solverMode & SOLVER_ENABLE_FRICTION_DIRECTION_CACHING ) || !( cp.m_contactPointFlags&BT_CONTACT_FLAG_LATERAL_FRICTION_INITIALIZED ) )
+        {
+            cp.m_lateralFrictionDir1 = vel - cp.m_normalWorldOnB * rel_vel;
+            btScalar lat_rel_vel = cp.m_lateralFrictionDir1.length2();
+            if ( !( infoGlobal.m_solverMode & SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION ) && lat_rel_vel > SIMD_EPSILON )
+            {
+                cp.m_lateralFrictionDir1 *= 1.f / btSqrt( lat_rel_vel );
+                applyAnisotropicFriction( colObj0, cp.m_lateralFrictionDir1, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                applyAnisotropicFriction( colObj1, cp.m_lateralFrictionDir1, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                setupFrictionConstraint( *frictionConstraint1, cp.m_lateralFrictionDir1, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, infoGlobal );
+
+                if ( frictionConstraint2 )
+                {
+                    cp.m_lateralFrictionDir2 = cp.m_lateralFrictionDir1.cross( cp.m_normalWorldOnB );
+                    cp.m_lateralFrictionDir2.normalize();//??
+                    applyAnisotropicFriction( colObj0, cp.m_lateralFrictionDir2, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                    applyAnisotropicFriction( colObj1, cp.m_lateralFrictionDir2, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                    setupFrictionConstraint( *frictionConstraint2, cp.m_lateralFrictionDir2, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, infoGlobal );
+                }
+            }
+            else
+            {
+                btPlaneSpace1( cp.m_normalWorldOnB, cp.m_lateralFrictionDir1, cp.m_lateralFrictionDir2 );
+
+                applyAnisotropicFriction( colObj0, cp.m_lateralFrictionDir1, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                applyAnisotropicFriction( colObj1, cp.m_lateralFrictionDir1, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                setupFrictionConstraint( *frictionConstraint1, cp.m_lateralFrictionDir1, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, infoGlobal );
+
+                if ( frictionConstraint2 )
+                {
+                    applyAnisotropicFriction( colObj0, cp.m_lateralFrictionDir2, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                    applyAnisotropicFriction( colObj1, cp.m_lateralFrictionDir2, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                    setupFrictionConstraint( *frictionConstraint2, cp.m_lateralFrictionDir2, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, infoGlobal );
+                }
+
+                if ( ( infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS ) && ( infoGlobal.m_solverMode & SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION ) )
+                {
+                    cp.m_contactPointFlags |= BT_CONTACT_FLAG_LATERAL_FRICTION_INITIALIZED;
+                }
+            }
+        }
+        else
+        {
+            setupFrictionConstraint( *frictionConstraint1, cp.m_lateralFrictionDir1, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, infoGlobal, cp.m_contactMotion1, cp.m_frictionCFM );
+            if ( frictionConstraint2 )
+            {
+                setupFrictionConstraint( *frictionConstraint2, cp.m_lateralFrictionDir2, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, infoGlobal, cp.m_contactMotion2, cp.m_frictionCFM );
+            }
+        }
+    }
+
+    setFrictionConstraintImpulse( contactConstraint, solverBodyIdA, solverBodyIdB, cp, infoGlobal );
+}
+// Parallel-for body: each index in [iBegin, iEnd) names a batch of m_bc, and every contact
+// constraint index listed in that batch is set up via internalSetupContactConstraints().
+struct SetupContactConstraintsLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btBatchedConstraints* m_bc;
+    const btContactSolverInfo* m_infoGlobal;
+
+    SetupContactConstraintsLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc, const btContactSolverInfo& infoGlobal )
+        : m_solver( solver )
+        , m_bc( bc )
+        , m_infoGlobal( &infoGlobal )
+    {
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "SetupContactConstraintsLoop" );
+        const btContactSolverInfo& solverInfo = *m_infoGlobal;
+        for ( int batchIndex = iBegin; batchIndex < iEnd; ++batchIndex )
+        {
+            const btBatchedConstraints::Range& range = m_bc->m_batches[ batchIndex ];
+            for ( int slot = range.begin; slot < range.end; ++slot )
+            {
+                m_solver->internalSetupContactConstraints( m_bc->m_constraintIndices[ slot ], solverInfo );
+            }
+        }
+    }
+};
+// Sets up every contact constraint: a plain loop over the pool when batching is off, otherwise
+// one btParallelFor over the batches of each phase of m_batchedContactConstraints.
+void btSequentialImpulseConstraintSolverMt::setupAllContactConstraints(const btContactSolverInfo& infoGlobal)
+{
+    BT_PROFILE( "setupAllContactConstraints" );
+    if ( !m_useBatching )
+    {
+        // batching disabled: walk the whole contact pool on the calling thread
+        for ( int iContact = 0; iContact < m_tmpSolverContactConstraintPool.size(); ++iContact )
+        {
+            internalSetupContactConstraints( iContact, infoGlobal );
+        }
+        return;
+    }
+    const btBatchedConstraints& bc = m_batchedContactConstraints;
+    SetupContactConstraintsLoop loopBody( this, &bc, infoGlobal );
+    const int grainSize = 1;
+    // batching enabled: run the phases one after another, in the order given by
+    // m_phaseOrder, handing the batch range of each phase to btParallelFor
+    for ( int iOrder = 0; iOrder < bc.m_phases.size(); ++iOrder )
+    {
+        const btBatchedConstraints::Range& phase = bc.m_phases[ bc.m_phaseOrder[ iOrder ] ];
+        btParallelFor( phase.begin, phase.end, grainSize, loopBody );
+    }
+}
+// Returns the m_tmpSolverBodyPool index of the solver body for 'body', creating the entry on first use.
+// Creation happens under m_bodySolverArrayMutex (plus the kinematic table mutex) after an unlocked first check.
+int	btSequentialImpulseConstraintSolverMt::getOrInitSolverBodyThreadsafe(btCollisionObject& body,btScalar timeStep)
+{
+    //
+    // getOrInitSolverBody is threadsafe only for a single thread per solver (with potentially multiple solvers)
+    //
+    // getOrInitSolverBodyThreadsafe -- attempts to be fully threadsafe (however may affect determinism)
+    //
+    int solverBodyId = -1;
+    // Only rigid bodies may take the dynamic path. Any other non-static, non-kinematic collision
+    // object is mapped to the shared fixed body below instead of being returned as id -1.
+    const bool isRigidBodyType = ( btRigidBody::upcast( &body ) != NULL );
+    if ( isRigidBodyType && !body.isStaticOrKinematicObject() )
+    {
+        // dynamic body
+        // Dynamic bodies can only be in one island, so it's safe to write to the companionId
+        solverBodyId = body.getCompanionId();
+        if ( solverBodyId < 0 )
+        {
+            m_bodySolverArrayMutex.lock();
+            // now that we have the lock, check again
+            solverBodyId = body.getCompanionId();
+            if ( solverBodyId < 0 )
+            {
+                solverBodyId = m_tmpSolverBodyPool.size();
+                btSolverBody& solverBody = m_tmpSolverBodyPool.expand();
+                initSolverBody( &solverBody, &body, timeStep );
+                body.setCompanionId( solverBodyId );
+            }
+            m_bodySolverArrayMutex.unlock();
+        }
+    }
+    else if (body.isKinematicObject())
+    {
+        //
+        // NOTE: must test for kinematic before static because some kinematic objects also
+        //   identify as "static"
+        //
+        // Kinematic bodies can be in multiple islands at once, so it is a
+        // race condition to write to them, so we use an alternate method
+        // to record the solverBodyId
+        int uniqueId = body.getWorldArrayIndex();
+        const int INVALID_SOLVER_BODY_ID = -1;
+        if (m_kinematicBodyUniqueIdToSolverBodyTable.size() <= uniqueId )
+        {
+            m_kinematicBodyUniqueIdToSolverBodyTableMutex.lock();
+            // now that we have the lock, check again
+            if ( m_kinematicBodyUniqueIdToSolverBodyTable.size() <= uniqueId )
+            {
+                m_kinematicBodyUniqueIdToSolverBodyTable.resize( uniqueId + 1, INVALID_SOLVER_BODY_ID );
+            }
+            m_kinematicBodyUniqueIdToSolverBodyTableMutex.unlock();
+        }
+        solverBodyId = m_kinematicBodyUniqueIdToSolverBodyTable[ uniqueId ];
+        // if no table entry yet,
+        if ( INVALID_SOLVER_BODY_ID == solverBodyId )
+        {
+            // need to acquire both locks
+            m_kinematicBodyUniqueIdToSolverBodyTableMutex.lock();
+            m_bodySolverArrayMutex.lock();
+            // now that we have the lock, check again
+            solverBodyId = m_kinematicBodyUniqueIdToSolverBodyTable[ uniqueId ];
+            if ( INVALID_SOLVER_BODY_ID == solverBodyId )
+            {
+                // create a table entry for this body
+                // (both mutexes are held while the pool grows and the table entry is written)
+                solverBodyId = m_tmpSolverBodyPool.size();
+                btSolverBody& solverBody = m_tmpSolverBodyPool.expand();
+                initSolverBody( &solverBody, &body, timeStep );
+                m_kinematicBodyUniqueIdToSolverBodyTable[ uniqueId ] = solverBodyId;
+            }
+            m_bodySolverArrayMutex.unlock();
+            m_kinematicBodyUniqueIdToSolverBodyTableMutex.unlock();
+        }
+    }
+    else
+    {
+        // all fixed bodies (inf mass) get mapped to a single solver id
+        if ( m_fixedBodyId < 0 )
+        {
+            m_bodySolverArrayMutex.lock();
+            // now that we have the lock, check again
+            if ( m_fixedBodyId < 0 )
+            {
+                m_fixedBodyId = m_tmpSolverBodyPool.size();
+                btSolverBody& fixedBody = m_tmpSolverBodyPool.expand();
+                initSolverBody( &fixedBody, 0, timeStep );
+            }
+            m_bodySolverArrayMutex.unlock();
+        }
+        solverBodyId = m_fixedBodyId;
+    }
+    btAssert( solverBodyId < m_tmpSolverBodyPool.size() );
+	return solverBodyId;
+}
+// For each of the numManifolds manifolds, stores in cachedInfoArray[i] the two solver body ids and the
+// contact points whose distance is within the manifold's contact processing threshold.
+void btSequentialImpulseConstraintSolverMt::internalCollectContactManifoldCachedInfo(btContactManifoldCachedInfo* cachedInfoArray, btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal)
+{
+    BT_PROFILE("internalCollectContactManifoldCachedInfo");
+    for (int i = 0; i < numManifolds; ++i)
+    {
+        btContactManifoldCachedInfo* cachedInfo = &cachedInfoArray[i];
+        btPersistentManifold* manifold = manifoldPtr[i];
+        btCollisionObject* colObj0 = (btCollisionObject*) manifold->getBody0();
+        btCollisionObject* colObj1 = (btCollisionObject*) manifold->getBody1();
+
+        int solverBodyIdA = getOrInitSolverBodyThreadsafe( *colObj0, infoGlobal.m_timeStep );
+        int solverBodyIdB = getOrInitSolverBodyThreadsafe( *colObj1, infoGlobal.m_timeStep );
+
+        cachedInfo->solverBodyIds[ 0 ] = solverBodyIdA;
+        cachedInfo->solverBodyIds[ 1 ] = solverBodyIdB;
+        cachedInfo->numTouchingContacts = 0;
+
+        btSolverBody* solverBodyA = &m_tmpSolverBodyPool[ solverBodyIdA ];
+        btSolverBody* solverBodyB = &m_tmpSolverBodyPool[ solverBodyIdB ];
+
+        ///avoid collision response between two static objects: skip only this manifold (a 'break' would leave the remaining entries unwritten)
+        if ( solverBodyA->m_invMass.fuzzyZero() && solverBodyB->m_invMass.fuzzyZero() )
+            continue;
+
+        int iContact = 0;
+        for ( int j = 0; j < manifold->getNumContacts(); j++ )
+        {
+            btManifoldPoint& cp = manifold->getContactPoint( j );
+
+            if ( cp.getDistance() <= manifold->getContactProcessingThreshold() )
+            {
+                cachedInfo->contactPoints[ iContact ] = &cp;
+                cachedInfo->contactHasRollingFriction[ iContact ] = ( cp.m_combinedRollingFriction > 0.f );
+                iContact++;
+            }
+        }
+        cachedInfo->numTouchingContacts = iContact;
+    }
+}
+// Parallel-for body that forwards the manifold sub-range [iBegin, iEnd) to
+// btSequentialImpulseConstraintSolverMt::internalCollectContactManifoldCachedInfo().
+struct CollectContactManifoldCachedInfoLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    btSequentialImpulseConstraintSolverMt::btContactManifoldCachedInfo* m_cachedInfoArray;
+    btPersistentManifold** m_manifoldPtr;
+    const btContactSolverInfo* m_infoGlobal;
+
+    CollectContactManifoldCachedInfoLoop( btSequentialImpulseConstraintSolverMt* solver, btSequentialImpulseConstraintSolverMt::btContactManifoldCachedInfo* cachedInfoArray, btPersistentManifold** manifoldPtr, const btContactSolverInfo& infoGlobal )
+        : m_solver( solver )
+        , m_cachedInfoArray( cachedInfoArray )
+        , m_manifoldPtr( manifoldPtr )
+        , m_infoGlobal( &infoGlobal )
+    {
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        m_solver->internalCollectContactManifoldCachedInfo( &m_cachedInfoArray[ iBegin ], &m_manifoldPtr[ iBegin ], iEnd - iBegin, *m_infoGlobal );
+    }
+};
+// Fills in the already-sized constraint pools for a run of manifolds: every touching contact gets its
+// body ids and contact point recorded, and its friction / rolling-friction slots linked back to it.
+void btSequentialImpulseConstraintSolverMt::internalAllocContactConstraints(const btContactManifoldCachedInfo* cachedInfoArray, int numManifolds)
+{
+    BT_PROFILE("internalAllocContactConstraints");
+    // each manifold writes to the slots starting at the indices stored in its cached info
+    for ( int m = 0; m < numManifolds; ++m )
+    {
+        const btContactManifoldCachedInfo& info = cachedInfoArray[ m ];
+        int iContactSlot = info.contactIndex;
+        int iFrictionSlot = iContactSlot * m_numFrictionDirections;
+        int iRollingSlot = info.rollingFrictionIndex;
+        for ( int iPoint = 0; iPoint < info.numTouchingContacts; ++iPoint )
+        {
+            btSolverConstraint& contact = m_tmpSolverContactConstraintPool[ iContactSlot ];
+            contact.m_solverBodyIdA = info.solverBodyIds[ 0 ];
+            contact.m_solverBodyIdB = info.solverBodyIds[ 1 ];
+            contact.m_originalContactPoint = info.contactPoints[ iPoint ];
+
+            // friction rows: m_numFrictionDirections consecutive slots, each pointing back at the contact
+            contact.m_frictionIndex = iFrictionSlot;
+            const int iFrictionEnd = iFrictionSlot + m_numFrictionDirections;
+            while ( iFrictionSlot < iFrictionEnd )
+            {
+                m_tmpSolverContactFrictionConstraintPool[ iFrictionSlot ].m_frictionIndex = iContactSlot;
+                ++iFrictionSlot;
+            }
+
+            // rolling friction rows: either none (-1 in the table) or a block of 3 slots
+            if ( !info.contactHasRollingFriction[ iPoint ] )
+            {
+                // indicate there is no rolling friction for this contact point
+                m_rollingFrictionIndexTable[ iContactSlot ] = -1;
+            }
+            else
+            {
+                // allocate 3 (although we may use only 2 sometimes)
+                m_rollingFrictionIndexTable[ iContactSlot ] = iRollingSlot;
+                const int iRollingEnd = iRollingSlot + 3;
+                for ( ; iRollingSlot < iRollingEnd; ++iRollingSlot )
+                {
+                    m_tmpSolverContactRollingFrictionConstraintPool[ iRollingSlot ].m_frictionIndex = iContactSlot;
+                }
+            }
+            ++iContactSlot;
+        }
+    }
+}
+// Parallel-for body that forwards the manifold sub-range [iBegin, iEnd) to
+// btSequentialImpulseConstraintSolverMt::internalAllocContactConstraints().
+struct AllocContactConstraintsLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btSequentialImpulseConstraintSolverMt::btContactManifoldCachedInfo* m_cachedInfoArray;
+
+    AllocContactConstraintsLoop( btSequentialImpulseConstraintSolverMt* solver, btSequentialImpulseConstraintSolverMt::btContactManifoldCachedInfo* cachedInfoArray )
+        : m_solver( solver )
+        , m_cachedInfoArray( cachedInfoArray )
+    {
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        m_solver->internalAllocContactConstraints( &m_cachedInfoArray[ iBegin ], iEnd - iBegin );
+    }
+};
+// Three stages: (1) gather per-manifold info via btParallelFor; (2) serially give each manifold its first
+// contact / rolling-friction index and size the pools; (3) fill in the constraint slots via btParallelFor.
+void btSequentialImpulseConstraintSolverMt::allocAllContactConstraints(btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal)
+{
+    BT_PROFILE( "allocAllContactConstraints" );
+    btAlignedObjectArray<btContactManifoldCachedInfo> cachedInfoArray; // = m_manifoldCachedInfoArray;
+    cachedInfoArray.resizeNoInitialize( numManifolds );
+    if (false)
+    {
+        // sequential
+        internalCollectContactManifoldCachedInfo(&cachedInfoArray[ 0 ], manifoldPtr, numManifolds, infoGlobal);
+    }
+    else
+    {
+        // may alter ordering of bodies which affects determinism
+        CollectContactManifoldCachedInfoLoop loop( this, &cachedInfoArray[ 0 ], manifoldPtr, infoGlobal );
+        int grainSize = 200;
+        btParallelFor( 0, numManifolds, grainSize, loop );
+    }
+
+    {
+        // serial part
+        int numContacts = 0;
+        int numRollingFrictionConstraints = 0;
+        for ( int iManifold = 0; iManifold < numManifolds; ++iManifold )
+        {
+            btContactManifoldCachedInfo& cachedInfo = cachedInfoArray[ iManifold ];
+            cachedInfo.contactIndex = numContacts;
+            cachedInfo.rollingFrictionIndex = numRollingFrictionConstraints;
+            numContacts += cachedInfo.numTouchingContacts;
+            for (int i = 0; i < cachedInfo.numTouchingContacts; ++i)
+            {
+                if (cachedInfo.contactHasRollingFriction[i])
+                {
+                    numRollingFrictionConstraints += 3;
+                }
+            }
+        }
+        m_tmpSolverContactConstraintPool.resizeNoInitialize(numContacts);
+        m_rollingFrictionIndexTable.resizeNoInitialize(numContacts);
+        m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize(numContacts*m_numFrictionDirections);
+        m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize(numRollingFrictionConstraints);
+    }
+    {
+        AllocContactConstraintsLoop loop(this, &cachedInfoArray[0]);
+        int grainSize = 200;
+        btParallelFor( 0, numManifolds, grainSize, loop );
+    }
+}
+// Converts contact manifolds into solver constraints. Without batching this defers to the base
+// class; with batching it allocates, batches and then sets up the contact constraints.
+void btSequentialImpulseConstraintSolverMt::convertContacts(btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal)
+{
+    if (!m_useBatching)
+    {
+        btSequentialImpulseConstraintSolver::convertContacts(manifoldPtr, numManifolds, infoGlobal);
+        return;
+    }
+    BT_PROFILE( "convertContacts" );
+    if (numManifolds <= 0)
+    {
+        return;
+    }
+    // create the shared fixed solver body if it does not exist yet
+    if ( m_fixedBodyId < 0 )
+    {
+        m_fixedBodyId = m_tmpSolverBodyPool.size();
+        btSolverBody& fixedBody = m_tmpSolverBodyPool.expand();
+        initSolverBody( &fixedBody, 0, infoGlobal.m_timeStep );
+    }
+    allocAllContactConstraints( manifoldPtr, numManifolds, infoGlobal );
+    // m_useBatching is true on this path (the non-batching case returned above)
+    setupBatchedContactConstraints();
+    setupAllContactConstraints( infoGlobal );
+}
+// For constraints [iBegin, iEnd): disabled joints get a zero-row size record; enabled joints have their
+// jacobian built, applied impulse and joint feedback cleared, and their size info queried via getInfo1().
+void btSequentialImpulseConstraintSolverMt::internalInitMultipleJoints( btTypedConstraint** constraints, int iBegin, int iEnd )
+{
+    BT_PROFILE("internalInitMultipleJoints");
+    for ( int iJoint = iBegin; iJoint < iEnd; ++iJoint )
+    {
+        btTypedConstraint* joint = constraints[ iJoint ];
+        btTypedConstraint::btConstraintInfo1& sizeInfo = m_tmpConstraintSizesPool[ iJoint ];
+        if ( !joint->isEnabled() )
+        {
+            // disabled joints contribute no solver rows
+            sizeInfo.m_numConstraintRows = 0;
+            sizeInfo.nub = 0;
+            continue;
+        }
+        // enabled joint: rebuild its jacobian and reset its applied impulse to zero
+        joint->buildJacobian();
+        joint->internalSetAppliedImpulse( 0.0f );
+        // zero the force/torque feedback record, if the joint has one
+        if ( btJointFeedback* feedback = joint->getJointFeedback() )
+        {
+            feedback->m_appliedForceBodyA.setZero();
+            feedback->m_appliedTorqueBodyA.setZero();
+            feedback->m_appliedForceBodyB.setZero();
+            feedback->m_appliedTorqueBodyB.setZero();
+        }
+        joint->getInfo1( &sizeInfo );
+    }
+}
+// Parallel-for body that forwards the joint index range [iBegin, iEnd) to
+// btSequentialImpulseConstraintSolverMt::internalInitMultipleJoints().
+struct InitJointsLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    btTypedConstraint** m_constraints;
+
+    InitJointsLoop( btSequentialImpulseConstraintSolverMt* solver, btTypedConstraint** constraints )
+        : m_solver( solver )
+        , m_constraints( constraints )
+    {
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        m_solver->internalInitMultipleJoints( m_constraints, iBegin, iEnd );
+    }
+};
+// Converts joints [iBegin, iEnd) into solver rows via convertJoint(); entries whose
+// m_solverConstraint is -1 are skipped.
+void btSequentialImpulseConstraintSolverMt::internalConvertMultipleJoints( const btAlignedObjectArray<JointParams>& jointParamsArray, btTypedConstraint** constraints, int iBegin, int iEnd, const btContactSolverInfo& infoGlobal )
+{
+    BT_PROFILE("internalConvertMultipleJoints");
+    for ( int iJoint = iBegin; iJoint < iEnd; ++iJoint )
+    {
+        const JointParams& params = jointParamsArray[ iJoint ];
+        const int firstRow = params.m_solverConstraint;
+        if ( firstRow == -1 )
+        {
+            continue;  // no row offset was recorded for this joint
+        }
+        const btTypedConstraint::btConstraintInfo1& sizeInfo = m_tmpConstraintSizesPool[ iJoint ];
+        btAssert( firstRow < m_tmpSolverNonContactConstraintPool.size() );
+        btAssert( sizeInfo.m_numConstraintRows > 0 );
+
+        // the joint's rows start at firstRow in the non-contact constraint pool
+        btSolverConstraint* rows = &m_tmpSolverNonContactConstraintPool[ firstRow ];
+        convertJoint( rows, constraints[ iJoint ], sizeInfo, params.m_solverBodyA, params.m_solverBodyB, infoGlobal );
+    }
+}
+// Parallel-for body that forwards the joint index range [iBegin, iEnd) to
+// btSequentialImpulseConstraintSolverMt::internalConvertMultipleJoints().
+struct ConvertJointsLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btAlignedObjectArray<btSequentialImpulseConstraintSolverMt::JointParams>& m_jointParamsArray;
+    btTypedConstraint** m_srcConstraints;
+    const btContactSolverInfo& m_infoGlobal;
+
+    ConvertJointsLoop( btSequentialImpulseConstraintSolverMt* solver,
+        const btAlignedObjectArray<btSequentialImpulseConstraintSolverMt::JointParams>& jointParamsArray,
+        btTypedConstraint** srcConstraints,
+        const btContactSolverInfo& infoGlobal )
+        : m_solver( solver )
+        , m_jointParamsArray( jointParamsArray )
+        , m_srcConstraints( srcConstraints )
+        , m_infoGlobal( infoGlobal )
+    {
+        // all members are set in the initializer list, in declaration order
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        m_solver->internalConvertMultipleJoints( m_jointParamsArray, m_srcConstraints, iBegin, iEnd, m_infoGlobal );
+    }
+};
+// Converts joints to solver rows. Without batching this defers to the base class; otherwise it sizes every
+// joint, serially assigns row offsets and solver bodies, converts the rows, then builds the joint batches.
+void btSequentialImpulseConstraintSolverMt::convertJoints(btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& infoGlobal)
+{
+    if ( !m_useBatching )
+    {
+        btSequentialImpulseConstraintSolver::convertJoints(constraints, numConstraints, infoGlobal);
+        return;
+    }
+    BT_PROFILE("convertJoints");
+    bool parallelJointSetup = true;
+	m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);
+    if (parallelJointSetup)
+    {
+        InitJointsLoop loop(this, constraints);
+        int grainSize = 40;
+        btParallelFor(0, numConstraints, grainSize, loop);
+    }
+    else
+    {
+        internalInitMultipleJoints( constraints, 0, numConstraints );
+    }
+
+	int totalNumRows = 0;
+    btAlignedObjectArray<JointParams> jointParamsArray;
+    jointParamsArray.resizeNoInitialize(numConstraints);
+
+	//calculate the total number of constraint rows
+	for (int i=0;i<numConstraints;i++)
+	{
+        btTypedConstraint* constraint = constraints[ i ];
+
+        JointParams& params = jointParamsArray[ i ];
+		const btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+
+		if (info1.m_numConstraintRows)
+		{
+            params.m_solverConstraint = totalNumRows;
+            params.m_solverBodyA = getOrInitSolverBody( constraint->getRigidBodyA(), infoGlobal.m_timeStep );
+            params.m_solverBodyB = getOrInitSolverBody( constraint->getRigidBodyB(), infoGlobal.m_timeStep );
+		}
+        else
+		{
+            params.m_solverConstraint = -1;
+		}
+		totalNumRows += info1.m_numConstraintRows;
+	}
+	m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows);
+
+	///setup the btSolverConstraints
+    if ( parallelJointSetup )
+    {
+        ConvertJointsLoop loop(this, jointParamsArray, constraints, infoGlobal);
+        int grainSize = 20;
+        btParallelFor(0, numConstraints, grainSize, loop);
+    }
+    else
+    {
+        internalConvertMultipleJoints( jointParamsArray, constraints, 0, numConstraints, infoGlobal );
+    }
+    setupBatchedJointConstraints();
+}
+
+
+// Initializes the solver bodies for bodies[iBegin..iEnd) and, for rigid bodies with a
+// non-zero inverse mass, adds the enabled gyroscopic terms to the external torque impulse.
+void btSequentialImpulseConstraintSolverMt::internalConvertBodies(btCollisionObject** bodies, int iBegin, int iEnd, const btContactSolverInfo& infoGlobal)
+{
+    BT_PROFILE("internalConvertBodies");
+    for ( int iBody = iBegin; iBody < iEnd; ++iBody )
+    {
+        btCollisionObject* collisionObj = bodies[ iBody ];
+        // the solver body lives at the same index as the collision object
+        collisionObj->setCompanionId( iBody );
+        btSolverBody& solverBody = m_tmpSolverBodyPool[ iBody ];
+        initSolverBody( &solverBody, collisionObj, infoGlobal.m_timeStep );
+        btRigidBody* rigidBody = btRigidBody::upcast( collisionObj );
+        if ( !rigidBody || !rigidBody->getInvMass() )
+        {
+            continue;
+        }
+        if ( rigidBody->getFlags() & BT_ENABLE_GYROSCOPIC_FORCE_EXPLICIT )
+        {
+            const btVector3 gyroForce = rigidBody->computeGyroscopicForceExplicit( infoGlobal.m_maxGyroscopicForce );
+            solverBody.m_externalTorqueImpulse -= gyroForce*rigidBody->getInvInertiaTensorWorld()*infoGlobal.m_timeStep;
+        }
+        if ( rigidBody->getFlags() & BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_WORLD )
+        {
+            solverBody.m_externalTorqueImpulse += rigidBody->computeGyroscopicImpulseImplicit_World( infoGlobal.m_timeStep );
+        }
+        if ( rigidBody->getFlags() & BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_BODY )
+        {
+            solverBody.m_externalTorqueImpulse += rigidBody->computeGyroscopicImpulseImplicit_Body( infoGlobal.m_timeStep );
+        }
+    }
+}
+
+
+// Parallel-for body that hands each index sub-range to internalConvertBodies().
+struct ConvertBodiesLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    btCollisionObject** m_bodies;
+    int m_numBodies;
+    const btContactSolverInfo& m_infoGlobal;
+
+    ConvertBodiesLoop( btSequentialImpulseConstraintSolverMt* solver,
+        btCollisionObject** bodies,
+        int numBodies,
+        const btContactSolverInfo& infoGlobal )
+        : m_solver( solver ),
+          m_bodies( bodies ),
+          m_numBodies( numBodies ),
+          m_infoGlobal( infoGlobal )
+    {}
+    // m_numBodies is stored but not read here; the range to process is passed in.
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        m_solver->internalConvertBodies( m_bodies, iBegin, iEnd, m_infoGlobal );
+    }
+};
+
+
+// Rebuilds m_tmpSolverBodyPool: one solver body per entry of 'bodies' plus a fixed
+// body stored in the extra last slot (index numBodies).
+void btSequentialImpulseConstraintSolverMt::convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal)
+{
+    BT_PROFILE("convertBodies");
+    m_kinematicBodyUniqueIdToSolverBodyTable.resize( 0 );
+
+    // one extra slot at the end holds the fixed body (initialized with a NULL collision object)
+    m_tmpSolverBodyPool.resizeNoInitialize( numBodies + 1 );
+    m_fixedBodyId = numBodies;
+    initSolverBody( &m_tmpSolverBodyPool[ m_fixedBodyId ], NULL, infoGlobal.m_timeStep );
+
+    // per-body setup, either directly or through btParallelFor
+    const bool parallelBodySetup = true;
+    if ( !parallelBodySetup )
+    {
+        internalConvertBodies( bodies, 0, numBodies, infoGlobal );
+    }
+    else
+    {
+        const int bodyGrainSize = 40;
+        ConvertBodiesLoop bodiesLoop(this, bodies, numBodies, infoGlobal);
+        btParallelFor(0, numBodies, bodyGrainSize, bodiesLoop);
+    }
+}
+
+
+// Chooses whether this solve uses batching, then runs the base-class setup.
+// Always returns 0; the base-class return value is not propagated.
+btScalar btSequentialImpulseConstraintSolverMt::solveGroupCacheFriendlySetup(
+     btCollisionObject** bodies,
+     int numBodies,
+     btPersistentManifold** manifoldPtr,
+     int numManifolds,
+     btTypedConstraint** constraints,
+     int numConstraints,
+     const btContactSolverInfo& infoGlobal,
+     btIDebugDraw* debugDrawer
+     )
+{
+    const bool twoFrictionDirs = ( infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS ) != 0;
+    m_numFrictionDirections = twoFrictionDirs ? 2 : 1;
+
+    // batch only with enough manifolds, and only if btThreadsAreRunning() is false
+    // or nested parallel-for loops are allowed
+    const bool enoughManifolds = numManifolds >= s_minimumContactManifoldsForBatching;
+    m_useBatching = enoughManifolds && ( s_allowNestedParallelForLoops || !btThreadsAreRunning() );
+    if ( m_useBatching )
+    {
+        m_batchedContactConstraints.m_debugDrawer = debugDrawer;
+        m_batchedJointConstraints.m_debugDrawer = debugDrawer;
+    }
+    btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(
+        bodies, numBodies,
+        manifoldPtr, numManifolds,
+        constraints, numConstraints,
+        infoGlobal, debugDrawer );
+    return 0.0f;
+}
+
+
+// Applies resolveSplitPenetrationImpulse to the contact rows listed in
+// consIndices[batchBegin..batchEnd) and returns the sum of the squared residuals.
+btScalar btSequentialImpulseConstraintSolverMt::resolveMultipleContactSplitPenetrationImpulseConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd )
+{
+    btScalar residualSqSum = 0.f;
+    for ( int slot = batchBegin; slot < batchEnd; ++slot )
+    {
+        const btSolverConstraint& contact = m_tmpSolverContactConstraintPool[ consIndices[ slot ] ];
+        const btScalar r = resolveSplitPenetrationImpulse( m_tmpSolverBodyPool[ contact.m_solverBodyIdA ],
+                                                           m_tmpSolverBodyPool[ contact.m_solverBodyIdB ], contact );
+        residualSqSum += r*r;
+    }
+    return residualSqSum;
+}
+
+
+// Parallel-sum body: runs the split-penetration pass over batches [iBegin, iEnd) of the
+// given btBatchedConstraints and returns the sum of the per-batch results.
+struct ContactSplitPenetrationImpulseSolverLoop : public btIParallelSumBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btBatchedConstraints* m_bc;
+
+    ContactSplitPenetrationImpulseSolverLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc )
+        : m_solver( solver ), m_bc( bc )
+    {}
+    btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "ContactSplitPenetrationImpulseSolverLoop" );
+        btScalar total = 0;
+        for ( int b = iBegin; b < iEnd; ++b )
+        {
+            const btBatchedConstraints::Range& range = m_bc->m_batches[ b ];
+            total += m_solver->resolveMultipleContactSplitPenetrationImpulseConstraints( m_bc->m_constraintIndices, range.begin, range.end );
+        }
+        return total;
+    }
+};
+
+
+// Runs up to m_numIterations split-impulse passes over the contact rows (only if m_splitImpulse), stopping once the residual is <= the threshold.
+void btSequentialImpulseConstraintSolverMt::solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
+{
+    BT_PROFILE("solveGroupCacheFriendlySplitImpulseIterations");
+    if ( !infoGlobal.m_splitImpulse )
+        return;
+    for ( int iteration = 0; iteration < infoGlobal.m_numIterations; iteration++ )
+    {
+        btScalar leastSquaresResidual = 0.f;
+        if ( m_useBatching )
+        {
+            // accumulate into the per-iteration residual above; redeclaring it here would hide the batched result from the convergence test
+            const btBatchedConstraints& batchedCons = m_batchedContactConstraints;
+            ContactSplitPenetrationImpulseSolverLoop loop( this, &batchedCons );
+            for ( int iiPhase = 0; iiPhase < batchedCons.m_phases.size(); ++iiPhase )
+            {
+                int iPhase = batchedCons.m_phaseOrder[ iiPhase ];
+                const btBatchedConstraints::Range& phase = batchedCons.m_phases[ iPhase ];
+                int grainSize = 8;
+                leastSquaresResidual += btParallelSum( phase.begin, phase.end, grainSize, loop );
+            }
+        }
+        else
+        {
+            // non-batched: a single serial pass over every contact row
+            leastSquaresResidual = resolveMultipleContactSplitPenetrationImpulseConstraints(m_orderTmpConstraintPool, 0, m_tmpSolverContactConstraintPool.size());
+        }
+        if ( leastSquaresResidual <= infoGlobal.m_leastSquaresResidualThreshold || iteration >= ( infoGlobal.m_numIterations - 1 ) )
+        {
+#ifdef VERBOSE_RESIDUAL_PRINTF
+            printf( "residual = %f at iteration #%d\n", leastSquaresResidual, iteration );
+#endif
+            break;
+        }
+    }
+}
+
+
+// One solver iteration for the batched path (defers to the base class when batching is off):
+// optional reordering, then joints, then obsolete-style joints, then contacts and friction.
+// Returns the accumulated squared residual reported by the resolveAll* passes.
+btScalar btSequentialImpulseConstraintSolverMt::solveSingleIteration(int iteration, btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds, btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
+{
+    if ( !m_useBatching )
+    {
+        return btSequentialImpulseConstraintSolver::solveSingleIteration( iteration, bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer );
+    }
+    BT_PROFILE( "solveSingleIterationMt" );
+    btScalar residualSqSum = 0.f;
+
+    if ( infoGlobal.m_solverMode & SOLVER_RANDMIZE_ORDER )
+    {
+        // reshuffled on every iteration; gating this on ((iteration & 7) == 0)
+        // instead would make the ordering a bit less random
+        randomizeConstraintOrdering( iteration, infoGlobal.m_numIterations );
+    }
+
+    // joints are solved regardless of the iteration-count check below
+    residualSqSum += resolveAllJointConstraints( iteration );
+
+    if ( iteration >= infoGlobal.m_numIterations )
+    {
+        // obsolete joints, contacts and friction only run while iteration < m_numIterations
+        return residualSqSum;
+    }
+
+    // this loop is only used for cone-twist constraints,
+    // it would be nice to skip this loop if none of the constraints need it
+    if ( m_useObsoleteJointConstraints )
+    {
+        for ( int iCons = 0; iCons < numConstraints; ++iCons )
+        {
+            btTypedConstraint* joint = constraints[ iCons ];
+            if ( !joint->isEnabled() )
+            {
+                continue;
+            }
+            const int idA = getOrInitSolverBody( joint->getRigidBodyA(), infoGlobal.m_timeStep );
+            const int idB = getOrInitSolverBody( joint->getRigidBodyB(), infoGlobal.m_timeStep );
+            joint->solveConstraintObsolete( m_tmpSolverBodyPool[ idA ], m_tmpSolverBodyPool[ idB ], infoGlobal.m_timeStep );
+        }
+    }
+
+    const bool interleave = ( infoGlobal.m_solverMode & SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS ) != 0;
+    if ( interleave )
+    {
+        // contact, contact-friction and rolling friction rows solved interleaved
+        residualSqSum += resolveAllContactConstraintsInterleaved();
+    }
+    else
+    {
+        // three separate passes: contacts, then contact friction, then rolling friction
+        residualSqSum += resolveAllContactConstraints();
+        residualSqSum += resolveAllContactFrictionConstraints();
+        residualSqSum += resolveAllRollingFrictionConstraints();
+    }
+
+    return residualSqSum;
+}
+
+
+// Solves the joint rows listed in consIndices[batchBegin..batchEnd), skipping rows whose
+// m_overrideNumSolverIterations is <= iteration. Returns the sum of the squared residuals.
+btScalar btSequentialImpulseConstraintSolverMt::resolveMultipleJointConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd, int iteration )
+{
+    btScalar residualSqSum = 0.f;
+    for ( int slot = batchBegin; slot < batchEnd; ++slot )
+    {
+        const btSolverConstraint& row = m_tmpSolverNonContactConstraintPool[ consIndices[ slot ] ];
+        if ( iteration >= row.m_overrideNumSolverIterations )
+        {
+            continue;
+        }
+        const btScalar r = resolveSingleConstraintRowGeneric( m_tmpSolverBodyPool[ row.m_solverBodyIdA ], m_tmpSolverBodyPool[ row.m_solverBodyIdB ], row );
+        residualSqSum += r*r;
+    }
+    return residualSqSum;
+}
+
+
+// Solves the contact rows listed in consIndices[batchBegin..batchEnd) with
+// resolveSingleConstraintRowLowerLimit and returns the sum of the squared residuals.
+btScalar btSequentialImpulseConstraintSolverMt::resolveMultipleContactConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd )
+{
+    btScalar sumSq = 0.f;
+    int slot = batchBegin;
+    while ( slot < batchEnd )
+    {
+        const btSolverConstraint& contactRow = m_tmpSolverContactConstraintPool[ consIndices[ slot++ ] ];
+        const btScalar r = resolveSingleConstraintRowLowerLimit( m_tmpSolverBodyPool[ contactRow.m_solverBodyIdA ], m_tmpSolverBodyPool[ contactRow.m_solverBodyIdB ], contactRow );
+        sumSq += r*r;
+    }
+    return sumSq;
+}
+
+
+// Solves the sliding-friction rows (m_numFrictionDirections per contact) of the contacts
+// listed in consIndices[batchBegin..batchEnd); returns the sum of the squared residuals.
+btScalar btSequentialImpulseConstraintSolverMt::resolveMultipleContactFrictionConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd )
+{
+    btScalar leastSquaresResidual = 0.f;
+    for ( int iiCons = batchBegin; iiCons < batchEnd; ++iiCons )
+    {
+        int iContact = consIndices[ iiCons ];
+        btScalar totalImpulse = m_tmpSolverContactConstraintPool[ iContact ].m_appliedImpulse;
+        // apply sliding friction only while the contact carries a positive applied impulse
+        if ( totalImpulse > 0.0f )
+        {
+            int iBegin = iContact * m_numFrictionDirections;
+            int iEnd = iBegin + m_numFrictionDirections;
+            for ( int iFriction = iBegin; iFriction < iEnd; ++iFriction )
+            {
+                // plain index: the loop header already advances iFriction, a post-increment here would skip the 2nd direction
+                btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[ iFriction ];
+                btAssert( solveManifold.m_frictionIndex == iContact );
+                solveManifold.m_lowerLimit = -( solveManifold.m_friction*totalImpulse );
+                solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+                btSolverBody& bodyA = m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdA ];
+                btSolverBody& bodyB = m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdB ];
+                btScalar residual = resolveSingleConstraintRowGeneric( bodyA, bodyB, solveManifold );
+                leastSquaresResidual += residual*residual;
+            }
+        }
+    }
+    return leastSquaresResidual;
+}
+
+
+// Solves the rolling-friction rows (at most 3 per contact) of the contacts listed in
+// consIndices[batchBegin..batchEnd); returns the sum of the squared residuals.
+btScalar btSequentialImpulseConstraintSolverMt::resolveMultipleContactRollingFrictionConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd )
+{
+    btScalar residualSqSum = 0.f;
+    for ( int slot = batchBegin; slot < batchEnd; ++slot )
+    {
+        const int contactIdx = consIndices[ slot ];
+        const int firstRow = m_rollingFrictionIndexTable[ contactIdx ];
+        if ( firstRow < 0 )
+        {
+            // negative table entry: no rolling friction rows for this contact
+            continue;
+        }
+        const btScalar normalImpulse = m_tmpSolverContactConstraintPool[ contactIdx ].m_appliedImpulse;
+        if ( !( normalImpulse > 0.0f ) )
+        {
+            continue;
+        }
+        for ( int k = 0; k < 3; ++k )
+        {
+            btSolverConstraint& row = m_tmpSolverContactRollingFrictionConstraintPool[ firstRow + k ];
+            if ( row.m_frictionIndex != contactIdx )
+            {
+                // this row belongs to a different contact, stop here
+                break;
+            }
+            // limit is friction*impulse, capped at the friction value itself
+            btScalar limit = row.m_friction*normalImpulse;
+            if ( limit > row.m_friction )
+                limit = row.m_friction;
+            row.m_lowerLimit = -limit;
+            row.m_upperLimit = limit;
+            const btScalar r = resolveSingleConstraintRowGeneric( m_tmpSolverBodyPool[ row.m_solverBodyIdA ], m_tmpSolverBodyPool[ row.m_solverBodyIdB ], row );
+            residualSqSum += r*r;
+        }
+    }
+    return residualSqSum;
+}
+
+
+// Solves, contact by contact, the penetration row followed by its sliding-friction and
+// rolling-friction rows for the contacts listed in contactIndices[batchBegin..batchEnd).
+// Returns the sum of the squared residuals of all rows solved.
+btScalar btSequentialImpulseConstraintSolverMt::resolveMultipleContactConstraintsInterleaved( const btAlignedObjectArray<int>& contactIndices,
+                                                                                          int batchBegin,
+                                                                                          int batchEnd
+                                                                                          )
+{
+    btScalar leastSquaresResidual = 0.f;
+    for ( int iiCons = batchBegin; iiCons < batchEnd; iiCons++ )
+    {
+        btScalar totalImpulse = 0;
+        int iContact = contactIndices[ iiCons ];
+        // apply penetration constraint, then read back its applied impulse for the friction limits
+        {
+            const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[ iContact ];
+            btScalar residual = resolveSingleConstraintRowLowerLimit( m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdA ], m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdB ], solveManifold );
+            leastSquaresResidual += residual*residual;
+            totalImpulse = solveManifold.m_appliedImpulse;
+        }
+
+        // apply sliding friction (m_numFrictionDirections consecutive rows per contact)
+        if ( totalImpulse > 0.0f )
+        {
+            int iBegin = iContact * m_numFrictionDirections;
+            int iEnd = iBegin + m_numFrictionDirections;
+            for ( int iFriction = iBegin; iFriction < iEnd; ++iFriction )
+            {
+                btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[ iFriction ];
+                btAssert( solveManifold.m_frictionIndex == iContact );
+                solveManifold.m_lowerLimit = -( solveManifold.m_friction*totalImpulse );
+                solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+
+                btSolverBody& bodyA = m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdA ];
+                btSolverBody& bodyB = m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdB ];
+                btScalar residual = resolveSingleConstraintRowGeneric( bodyA, bodyB, solveManifold );
+                leastSquaresResidual += residual*residual;
+            }
+        }
+
+        // apply rolling friction (a negative table entry means the contact has none)
+        int iFirstRollingFriction = m_rollingFrictionIndexTable[ iContact ];
+        if ( totalImpulse > 0.0f && iFirstRollingFriction >= 0)
+        {
+            int iBegin = iFirstRollingFriction;
+            int iEnd = iBegin + 3;
+            for ( int iRollingFric = iBegin; iRollingFric < iEnd; ++iRollingFric )
+            {
+                btSolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[ iRollingFric ];
+                if ( rollingFrictionConstraint.m_frictionIndex != iContact )
+                {
+                    break;  // row belongs to a different contact
+                }
+                btScalar rollingFrictionMagnitude = rollingFrictionConstraint.m_friction*totalImpulse;
+                if ( rollingFrictionMagnitude > rollingFrictionConstraint.m_friction )
+                {
+                    rollingFrictionMagnitude = rollingFrictionConstraint.m_friction;
+                }
+
+                rollingFrictionConstraint.m_lowerLimit = -rollingFrictionMagnitude;
+                rollingFrictionConstraint.m_upperLimit = rollingFrictionMagnitude;
+
+                btScalar residual = resolveSingleConstraintRowGeneric( m_tmpSolverBodyPool[ rollingFrictionConstraint.m_solverBodyIdA ], m_tmpSolverBodyPool[ rollingFrictionConstraint.m_solverBodyIdB ], rollingFrictionConstraint );
+                leastSquaresResidual += residual*residual;
+            }
+        }
+    }
+    return leastSquaresResidual;
+}
+
+
+// Shuffles the phase order and, separately inside every batch, the order of the
+// constraint indices, drawing swap partners from btRandInt2().
+void btSequentialImpulseConstraintSolverMt::randomizeBatchedConstraintOrdering( btBatchedConstraints* batchedConstraints )
+{
+    btBatchedConstraints& bc = *batchedConstraints;
+    // shuffle the phases (the loop starts at 1, so no draw is made for element 0)
+    const int numPhases = bc.m_phaseOrder.size();
+    for ( int i = 1; i < numPhases; ++i )
+    {
+        bc.m_phaseOrder.swap( i, btRandInt2( i + 1 ) );
+    }
+    // shuffle the constraints; swap partners always come from the same batch
+    for ( int b = 0; b < bc.m_batches.size(); ++b )
+    {
+        const btBatchedConstraints::Range& range = bc.m_batches[ b ];
+        for ( int pos = range.begin; pos < range.end; ++pos )
+        {
+            const int partner = range.begin + btRandInt2( pos - range.begin + 1 );
+            btAssert( partner >= range.begin && partner < range.end );
+            bc.m_constraintIndices.swap( pos, partner );
+        }
+    }
+}
+
+
+// Reshuffles the joint batches on every call; the contact batches only while
+// iteration < numIterations, since contact/friction rows are not solved beyond that.
+void btSequentialImpulseConstraintSolverMt::randomizeConstraintOrdering(int iteration, int numIterations)
+{
+    randomizeBatchedConstraintOrdering( &m_batchedJointConstraints );
+    if ( iteration >= numIterations )
+    {
+        return;
+    }
+    randomizeBatchedConstraintOrdering( &m_batchedContactConstraints );
+}
+
+
+// Parallel-sum body: solves the joint batches [iBegin, iEnd) for the stored iteration
+// and returns the sum of what resolveMultipleJointConstraints reports per batch.
+struct JointSolverLoop : public btIParallelSumBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btBatchedConstraints* m_bc;
+    int m_iteration;
+
+    JointSolverLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc, int iteration )
+        : m_solver( solver ), m_bc( bc ), m_iteration( iteration )
+    {
+    }
+    btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "JointSolverLoop" );
+        btScalar total = 0;
+        for ( int b = iBegin; b < iEnd; ++b )
+        {
+            const btBatchedConstraints::Range& range = m_bc->m_batches[ b ];
+            total += m_solver->resolveMultipleJointConstraints( m_bc->m_constraintIndices, range.begin, range.end, m_iteration );
+        }
+        return total;
+    }
+};
+
+
+// Solves all batched joint rows, visiting phases in m_phaseOrder order and summing each phase's btParallelSum result.
+btScalar btSequentialImpulseConstraintSolverMt::resolveAllJointConstraints(int iteration)
+{
+    BT_PROFILE( "resolveAllJointConstraints" );
+    const btBatchedConstraints& joints = m_batchedJointConstraints;
+    JointSolverLoop loop( this, &joints, iteration );
+    const int grainSize = 1;
+    btScalar total = 0.f;
+    for ( int i = 0; i < joints.m_phases.size(); ++i )
+    {
+        const btBatchedConstraints::Range& phase = joints.m_phases[ joints.m_phaseOrder[ i ] ];
+        total += btParallelSum( phase.begin, phase.end, grainSize, loop );
+    }
+    return total;
+}
+
+
+// Parallel-sum body: solves the contact rows of batches [iBegin, iEnd) and returns
+// the sum of what resolveMultipleContactConstraints reports per batch.
+struct ContactSolverLoop : public btIParallelSumBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btBatchedConstraints* m_bc;
+
+    ContactSolverLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc )
+        : m_solver( solver ), m_bc( bc )
+    {}
+    btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "ContactSolverLoop" );
+        btScalar total = 0;
+        for ( int b = iBegin; b < iEnd; ++b )
+        {
+            const btBatchedConstraints::Range& range = m_bc->m_batches[ b ];
+            total += m_solver->resolveMultipleContactConstraints( m_bc->m_constraintIndices, range.begin, range.end );
+        }
+        return total;
+    }
+};
+
+
+// Solves all batched contact rows phase by phase (in m_phaseOrder order), using each phase's own grain size.
+btScalar btSequentialImpulseConstraintSolverMt::resolveAllContactConstraints()
+{
+    BT_PROFILE( "resolveAllContactConstraints" );
+    const btBatchedConstraints& contacts = m_batchedContactConstraints;
+    ContactSolverLoop loop( this, &contacts );
+    btScalar total = 0.f;
+    for ( int i = 0; i < contacts.m_phases.size(); ++i )
+    {
+        const int phaseIdx = contacts.m_phaseOrder[ i ];
+        const btBatchedConstraints::Range& phase = contacts.m_phases[ phaseIdx ];
+        total += btParallelSum( phase.begin, phase.end, contacts.m_phaseGrainSize[ phaseIdx ], loop );
+    }
+    return total;
+}
+
+
+// Parallel-sum body: solves the sliding-friction rows of batches [iBegin, iEnd) and
+// returns the sum of what resolveMultipleContactFrictionConstraints reports per batch.
+struct ContactFrictionSolverLoop : public btIParallelSumBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btBatchedConstraints* m_bc;
+
+    ContactFrictionSolverLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc )
+        : m_solver( solver ), m_bc( bc )
+    {}
+    btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "ContactFrictionSolverLoop" );
+        btScalar total = 0;
+        for ( int b = iBegin; b < iEnd; ++b )
+        {
+            const btBatchedConstraints::Range& range = m_bc->m_batches[ b ];
+            total += m_solver->resolveMultipleContactFrictionConstraints( m_bc->m_constraintIndices, range.begin, range.end );
+        }
+        return total;
+    }
+};
+
+
+// Solves all sliding-friction rows using the contact batches, phase by phase (in m_phaseOrder order).
+btScalar btSequentialImpulseConstraintSolverMt::resolveAllContactFrictionConstraints()
+{
+    BT_PROFILE( "resolveAllContactFrictionConstraints" );
+    const btBatchedConstraints& contacts = m_batchedContactConstraints;
+    ContactFrictionSolverLoop loop( this, &contacts );
+    btScalar total = 0.f;
+    for ( int i = 0; i < contacts.m_phases.size(); ++i )
+    {
+        const int phaseIdx = contacts.m_phaseOrder[ i ];
+        const btBatchedConstraints::Range& phase = contacts.m_phases[ phaseIdx ];
+        total += btParallelSum( phase.begin, phase.end, contacts.m_phaseGrainSize[ phaseIdx ], loop );
+    }
+    return total;
+}
+
+
+// Parallel-sum body: runs the interleaved contact/friction solve over batches [iBegin, iEnd)
+// and returns the sum of what resolveMultipleContactConstraintsInterleaved reports per batch.
+struct InterleavedContactSolverLoop : public btIParallelSumBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btBatchedConstraints* m_bc;
+
+    InterleavedContactSolverLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc )
+        : m_solver( solver ), m_bc( bc )
+    {}
+    btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "InterleavedContactSolverLoop" );
+        btScalar total = 0;
+        for ( int b = iBegin; b < iEnd; ++b )
+        {
+            const btBatchedConstraints::Range& range = m_bc->m_batches[ b ];
+            total += m_solver->resolveMultipleContactConstraintsInterleaved( m_bc->m_constraintIndices, range.begin, range.end );
+        }
+        return total;
+    }
+};
+
+
+// Solves contact, sliding-friction and rolling-friction rows interleaved, phase by phase (in m_phaseOrder order).
+btScalar btSequentialImpulseConstraintSolverMt::resolveAllContactConstraintsInterleaved()
+{
+    BT_PROFILE( "resolveAllContactConstraintsInterleaved" );
+    const btBatchedConstraints& contacts = m_batchedContactConstraints;
+    InterleavedContactSolverLoop loop( this, &contacts );
+    const int grainSize = 1;
+    btScalar total = 0.f;
+    for ( int i = 0; i < contacts.m_phases.size(); ++i )
+    {
+        const btBatchedConstraints::Range& phase = contacts.m_phases[ contacts.m_phaseOrder[ i ] ];
+        total += btParallelSum( phase.begin, phase.end, grainSize, loop );
+    }
+    return total;
+}
+
+
+// Parallel-sum body: solves the rolling-friction rows of batches [iBegin, iEnd) and returns
+// the sum of what resolveMultipleContactRollingFrictionConstraints reports per batch.
+struct ContactRollingFrictionSolverLoop : public btIParallelSumBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btBatchedConstraints* m_bc;
+
+    ContactRollingFrictionSolverLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc )
+        : m_solver( solver ), m_bc( bc )
+    {}
+    btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "ContactRollingFrictionSolverLoop" );  // own label, so it is not reported as ContactFrictionSolverLoop
+        btScalar sum = 0;
+        for ( int iBatch = iBegin; iBatch < iEnd; ++iBatch )
+        {
+            const btBatchedConstraints::Range& batch = m_bc->m_batches[ iBatch ];
+            sum += m_solver->resolveMultipleContactRollingFrictionConstraints( m_bc->m_constraintIndices, batch.begin, batch.end );
+        }
+        return sum;
+    }
+};
+
+
+// Solves all rolling-friction rows (batched or serial, see below) and returns the sum of their squared residuals.
+btScalar btSequentialImpulseConstraintSolverMt::resolveAllRollingFrictionConstraints()
+{
+    BT_PROFILE( "resolveAllRollingFrictionConstraints" );
+    btScalar leastSquaresResidual = 0.f;
+    //
+    // We do not generate batches for rolling friction constraints. We assume that
+    // one of two cases is true:
+    //
+    //  1. either most bodies in the simulation have rolling friction, in which case we can use the
+    //     batches for contacts and use a lookup table to translate contact indices to rolling friction
+    //     (ignoring any contact indices that don't map to a rolling friction constraint). As long as
+    //     most contacts have a corresponding rolling friction constraint, this should parallelize well.
+    //
+    //  -OR-
+    //
+    //  2. few bodies in the simulation have rolling friction, so it is not worth trying to use the
+    //     batches from contacts as most of the contacts won't have corresponding rolling friction
+    //     constraints and most threads would end up doing very little work. Most of the time would
+    //     go to threading overhead, so we don't bother with threading.
+    //
+    int numRollingFrictionPoolConstraints = m_tmpSolverContactRollingFrictionConstraintPool.size();
+    if (numRollingFrictionPoolConstraints >= m_tmpSolverContactConstraintPool.size())
+    {
+        // use batching if there are many rolling friction constraints; sums go into the function-level residual so the caller sees them
+        const btBatchedConstraints& batchedCons = m_batchedContactConstraints;
+        ContactRollingFrictionSolverLoop loop( this, &batchedCons );
+        for ( int iiPhase = 0; iiPhase < batchedCons.m_phases.size(); ++iiPhase )
+        {
+            int iPhase = batchedCons.m_phaseOrder[ iiPhase ];
+            const btBatchedConstraints::Range& phase = batchedCons.m_phases[ iPhase ];
+            int grainSize = 1;
+            leastSquaresResidual += btParallelSum( phase.begin, phase.end, grainSize, loop );
+        }
+    }
+    else
+    {
+        // no batching, also ignores SOLVER_RANDMIZE_ORDER
+        for ( int j = 0; j < numRollingFrictionPoolConstraints; j++ )
+        {
+            btSolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[ j ];
+            if ( rollingFrictionConstraint.m_frictionIndex >= 0 )
+            {
+                btScalar totalImpulse = m_tmpSolverContactConstraintPool[ rollingFrictionConstraint.m_frictionIndex ].m_appliedImpulse;
+                if ( totalImpulse > 0.0f )
+                {
+                    btScalar rollingFrictionMagnitude = rollingFrictionConstraint.m_friction*totalImpulse;
+                    if ( rollingFrictionMagnitude > rollingFrictionConstraint.m_friction )
+                        rollingFrictionMagnitude = rollingFrictionConstraint.m_friction;
+
+                    rollingFrictionConstraint.m_lowerLimit = -rollingFrictionMagnitude;
+                    rollingFrictionConstraint.m_upperLimit = rollingFrictionMagnitude;
+
+                    btScalar residual = resolveSingleConstraintRowGeneric( m_tmpSolverBodyPool[ rollingFrictionConstraint.m_solverBodyIdA ], m_tmpSolverBodyPool[ rollingFrictionConstraint.m_solverBodyIdB ], rollingFrictionConstraint );
+                    leastSquaresResidual += residual*residual;
+                }
+            }
+        }
+    }
+    return leastSquaresResidual;
+}
+
+
+void btSequentialImpulseConstraintSolverMt::internalWriteBackContacts( int iBegin, int iEnd, const btContactSolverInfo& infoGlobal )
+{
+    BT_PROFILE("internalWriteBackContacts");  // profiling scope around this per-range call
+    writeBackContacts(iBegin, iEnd, infoGlobal);  // thin wrapper: index range and solver info are forwarded unchanged
+    //for ( int iContact = iBegin; iContact < iEnd; ++iContact)   <-- disabled inline version, kept for reference only (presumably what writeBackContacts() does -- TODO confirm)
+    //{
+    //    const btSolverConstraint& contactConstraint = m_tmpSolverContactConstraintPool[ iContact ];
+    //    btManifoldPoint* pt = (btManifoldPoint*) contactConstraint.m_originalContactPoint;
+    //    btAssert( pt );
+    //    pt->m_appliedImpulse = contactConstraint.m_appliedImpulse;
+    //    pt->m_appliedImpulseLateral1 = m_tmpSolverContactFrictionConstraintPool[ contactConstraint.m_frictionIndex ].m_appliedImpulse;
+    //    if ( m_numFrictionDirections == 2 )
+    //    {
+    //        pt->m_appliedImpulseLateral2 = m_tmpSolverContactFrictionConstraintPool[ contactConstraint.m_frictionIndex + 1 ].m_appliedImpulse;
+    //    }
+    //}
+}
+
+
+void btSequentialImpulseConstraintSolverMt::internalWriteBackJoints( int iBegin, int iEnd, const btContactSolverInfo& infoGlobal )
+{
+	BT_PROFILE("internalWriteBackJoints");
+    writeBackJoints(iBegin, iEnd, infoGlobal);  // thin wrapper: forwards the range unchanged; invoked per sub-range by WriteJointsLoop::forLoop
+}
+
+
+void btSequentialImpulseConstraintSolverMt::internalWriteBackBodies( int iBegin, int iEnd, const btContactSolverInfo& infoGlobal )
+{
+	BT_PROFILE("internalWriteBackBodies");
+    writeBackBodies( iBegin, iEnd, infoGlobal );  // thin wrapper: forwards the range unchanged; invoked per sub-range by WriteBodiesLoop::forLoop
+}
+
+
+// Parallel-for body: every forLoop() call hands one index sub-range to the solver's internalWriteBackContacts().
+struct WriteContactPointsLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;  // solver that performs the write-back (not owned)
+    const btContactSolverInfo* m_infoGlobal;          // solver settings passed along with every call (not owned)
+
+    WriteContactPointsLoop( btSequentialImpulseConstraintSolverMt* solver, const btContactSolverInfo& infoGlobal )
+        : m_solver( solver ), m_infoGlobal( &infoGlobal )
+    {
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        m_solver->internalWriteBackContacts( iBegin, iEnd, *m_infoGlobal );
+    }
+};
+
+
+// Parallel-for body: every forLoop() call hands one index sub-range to the solver's internalWriteBackJoints().
+struct WriteJointsLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;  // solver that performs the write-back (not owned)
+    const btContactSolverInfo* m_infoGlobal;          // solver settings passed along with every call (not owned)
+
+    WriteJointsLoop( btSequentialImpulseConstraintSolverMt* solver, const btContactSolverInfo& infoGlobal )
+        : m_solver( solver ), m_infoGlobal( &infoGlobal )
+    {
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        m_solver->internalWriteBackJoints( iBegin, iEnd, *m_infoGlobal );
+    }
+};
+
+
+// Parallel-for body: every forLoop() call hands one index sub-range to the solver's internalWriteBackBodies().
+struct WriteBodiesLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;  // solver that performs the write-back (not owned)
+    const btContactSolverInfo* m_infoGlobal;          // solver settings passed along with every call (not owned)
+
+    WriteBodiesLoop( btSequentialImpulseConstraintSolverMt* solver, const btContactSolverInfo& infoGlobal )
+        : m_solver( solver ), m_infoGlobal( &infoGlobal )
+    {
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        m_solver->internalWriteBackBodies( iBegin, iEnd, *m_infoGlobal );
+    }
+};
+
+
+btScalar btSequentialImpulseConstraintSolverMt::solveGroupCacheFriendlyFinish(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal)
+{
+    // Final solver stage: run the three write-back loops in parallel, then empty the temporary pools. Always returns 0.
+    BT_PROFILE("solveGroupCacheFriendlyFinish");
+
+    const int contactGrainSize = 500;  // grain size handed to btParallelFor for the contact write-back
+    const int jointGrainSize = 400;    // ... for the joint write-back
+    const int bodyGrainSize = 100;     // ... for the body write-back
+
+    // the contact write-back only runs when warmstarting is enabled in the solver mode
+    if ( ( infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING ) != 0 )
+    {
+        WriteContactPointsLoop contactLoop( this, infoGlobal );
+        btParallelFor( 0, m_tmpSolverContactConstraintPool.size(), contactGrainSize, contactLoop );
+    }
+
+    WriteJointsLoop jointLoop( this, infoGlobal );
+    btParallelFor( 0, m_tmpSolverNonContactConstraintPool.size(), jointGrainSize, jointLoop );
+
+    WriteBodiesLoop bodyLoop( this, infoGlobal );
+    btParallelFor( 0, m_tmpSolverBodyPool.size(), bodyGrainSize, bodyLoop );
+
+    // every temporary pool is resized to zero once the write-back loops have returned
+    m_tmpSolverContactConstraintPool.resizeNoInitialize( 0 );
+    m_tmpSolverNonContactConstraintPool.resizeNoInitialize( 0 );
+    m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize( 0 );
+    m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize( 0 );
+    m_tmpSolverBodyPool.resizeNoInitialize( 0 );
+    return 0.f;
+}
+
diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h
new file mode 100644
index 000000000..0577d8d2d
--- /dev/null
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h
@@ -0,0 +1,154 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_SEQUENTIAL_IMPULSE_CONSTRAINT_SOLVER_MT_H
+#define BT_SEQUENTIAL_IMPULSE_CONSTRAINT_SOLVER_MT_H
+
+#include "btSequentialImpulseConstraintSolver.h"
+#include "btBatchedConstraints.h"
+#include "LinearMath/btThreads.h"
+
+///
+/// btSequentialImpulseConstraintSolverMt
+///
+///  A multithreaded variant of the sequential impulse constraint solver. The constraints to be solved are grouped into
+///  batches and phases where each batch of constraints within a given phase can be solved in parallel with the rest.
+///  Ideally we want as few phases as possible, and each phase should have many batches, and all of the batches should
+///  have about the same number of constraints.
+///  This method works best on a large island of many constraints.
+///
+///  Supports all of the features of the normal sequential impulse solver such as:
+///    - split penetration impulse
+///    - rolling friction
+///    - interleaving constraints
+///    - warmstarting
+///    - 2 friction directions
+///    - randomized constraint ordering
+///    - early termination when leastSquaresResidualThreshold is satisfied
+///
+///  When the SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS flag is enabled, unlike the normal SequentialImpulse solver,
+///  the rolling friction is interleaved as well.
+///  Interleaving the contact penetration constraints with friction reduces the number of parallel loops that need to be done,
+///  which reduces threading overhead so it can be a performance win, however, it does seem to produce a less stable simulation,
+///  at least on stacks of blocks.
+///
+///  When the SOLVER_RANDMIZE_ORDER flag is enabled, the ordering of phases, and the ordering of constraints within each batch
+///  is randomized, however it does not swap constraints between batches.
+///  This is to avoid regenerating the batches for each solver iteration which would be quite costly in performance.
+///
+///  Note that a non-zero leastSquaresResidualThreshold could possibly affect the determinism of the simulation
+///  if the task scheduler's parallelSum operation is non-deterministic. The parallelSum operation can be non-deterministic
+///  because floating point addition is not associative due to rounding errors.
+///  The task scheduler can and should ensure that the result of any parallelSum operation is deterministic.
+///
+ATTRIBUTE_ALIGNED16(class) btSequentialImpulseConstraintSolverMt : public btSequentialImpulseConstraintSolver
+{
+public:  // solver stages overridden from the base class (all marked BT_OVERRIDE)
+	virtual void solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE;
+	virtual btScalar solveSingleIteration(int iteration, btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE;
+	virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE;
+	virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
+
+    // temp struct used to collect info from persistent manifolds into a cache-friendly struct using multiple threads
+    struct btContactManifoldCachedInfo
+    {
+        static const int MAX_NUM_CONTACT_POINTS = 4;  // capacity of the two per-contact arrays below
+
+        int numTouchingContacts;
+        int solverBodyIds[ 2 ];
+        int contactIndex;
+        int rollingFrictionIndex;
+        bool contactHasRollingFriction[ MAX_NUM_CONTACT_POINTS ];
+        btManifoldPoint* contactPoints[ MAX_NUM_CONTACT_POINTS ];
+    };
+    // temp struct used for setting up joint constraints in parallel
+    struct JointParams
+    {
+        int m_solverConstraint;
+        int m_solverBodyA;
+        int m_solverBodyB;
+    };
+    void internalInitMultipleJoints(btTypedConstraint** constraints, int iBegin, int iEnd);
+    void internalConvertMultipleJoints( const btAlignedObjectArray<JointParams>& jointParamsArray, btTypedConstraint** constraints, int iBegin, int iEnd, const btContactSolverInfo& infoGlobal );
+
+    // parameters to control batching (static: shared by all instances of this solver)
+    static bool s_allowNestedParallelForLoops;        // whether to allow nested parallel operations
+    static int s_minimumContactManifoldsForBatching;  // don't even try to batch if fewer manifolds than this
+    static btBatchedConstraints::BatchingMethod s_contactBatchingMethod;
+    static btBatchedConstraints::BatchingMethod s_jointBatchingMethod;
+    static int s_minBatchSize;  // desired number of constraints per batch
+    static int s_maxBatchSize;
+
+protected:
+    static const int CACHE_LINE_SIZE = 64;  // size in bytes of m_antiFalseSharingPadding below
+
+    btBatchedConstraints m_batchedContactConstraints;
+    btBatchedConstraints m_batchedJointConstraints;
+    int m_numFrictionDirections;  // presumably 1 or 2 friction rows per contact -- confirm in the .cpp
+    bool m_useBatching;
+    bool m_useObsoleteJointConstraints;
+    btAlignedObjectArray<btContactManifoldCachedInfo> m_manifoldCachedInfoArray;
+    btAlignedObjectArray<int> m_rollingFrictionIndexTable;  // lookup table mapping contact index to rolling friction index
+    btSpinMutex m_bodySolverArrayMutex;
+    char m_antiFalseSharingPadding[CACHE_LINE_SIZE]; // padding to keep mutexes in separate cachelines
+    btSpinMutex m_kinematicBodyUniqueIdToSolverBodyTableMutex;
+    btAlignedObjectArray<char> m_scratchMemory;
+
+    virtual void randomizeConstraintOrdering( int iteration, int numIterations );
+    virtual btScalar resolveAllJointConstraints( int iteration );
+    virtual btScalar resolveAllContactConstraints();
+    virtual btScalar resolveAllContactFrictionConstraints();
+    virtual btScalar resolveAllContactConstraintsInterleaved();
+    virtual btScalar resolveAllRollingFrictionConstraints();
+
+    virtual void setupBatchedContactConstraints();
+    virtual void setupBatchedJointConstraints();
+    virtual void convertJoints(btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
+	virtual void convertContacts(btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
+    virtual void convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
+
+	int getOrInitSolverBodyThreadsafe(btCollisionObject& body, btScalar timeStep);
+    void allocAllContactConstraints(btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal);
+    void setupAllContactConstraints(const btContactSolverInfo& infoGlobal);
+    void randomizeBatchedConstraintOrdering( btBatchedConstraints* batchedConstraints );
+
+public:  // construction plus per-range helpers; the helpers are public because non-member loop functors in the .cpp (e.g. WriteContactPointsLoop) call them
+
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	
+	btSequentialImpulseConstraintSolverMt();
+	virtual ~btSequentialImpulseConstraintSolverMt();
+
+    btScalar resolveMultipleJointConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd, int iteration );
+    btScalar resolveMultipleContactConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
+    btScalar resolveMultipleContactSplitPenetrationImpulseConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
+    btScalar resolveMultipleContactFrictionConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
+    btScalar resolveMultipleContactRollingFrictionConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
+    btScalar resolveMultipleContactConstraintsInterleaved( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
+
+    void internalCollectContactManifoldCachedInfo(btContactManifoldCachedInfo* cachedInfoArray, btPersistentManifold** manifold, int numManifolds, const btContactSolverInfo& infoGlobal);
+    void internalAllocContactConstraints(const btContactManifoldCachedInfo* cachedInfoArray, int numManifolds);
+    void internalSetupContactConstraints(int iContact, const btContactSolverInfo& infoGlobal);
+    void internalConvertBodies(btCollisionObject** bodies, int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
+    void internalWriteBackContacts(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);  // called by WriteContactPointsLoop::forLoop
+    void internalWriteBackJoints(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);    // called by WriteJointsLoop::forLoop
+    void internalWriteBackBodies(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);    // called by WriteBodiesLoop::forLoop
+};
+
+
+
+
+#endif //BT_SEQUENTIAL_IMPULSE_CONSTRAINT_SOLVER_MT_H
+
diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
index 1d10bad92..330bccb87 100644
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
@@ -325,3 +325,14 @@ void btDiscreteDynamicsWorldMt::integrateTransforms( btScalar timeStep )
     }
 }
 
+
+int	btDiscreteDynamicsWorldMt::stepSimulation( btScalar timeStep, int maxSubSteps, btScalar fixedTimeStep )
+{
+    const int stepsTaken = btDiscreteDynamicsWorld::stepSimulation( timeStep, maxSubSteps, fixedTimeStep );  // regular base-class step; its result is returned as-is
+    btITaskScheduler* const taskScheduler = btGetTaskScheduler();
+    if ( taskScheduler )
+    {
+        taskScheduler->sleepWorkerThreadsHint();  // tell Bullet's threads to sleep, so other threads can run
+    }
+    return stepsTaken;
+}
diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
index 2f144cdda..a8cc22dd0 100644
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
@@ -129,6 +129,8 @@ public:
         btCollisionConfiguration* collisionConfiguration
     );
 	virtual ~btDiscreteDynamicsWorldMt();
+
+    virtual int	stepSimulation( btScalar timeStep, int maxSubSteps, btScalar fixedTimeStep ) BT_OVERRIDE;
 };
 
 #endif //BT_DISCRETE_DYNAMICS_WORLD_H
diff --git a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
index 65e1a140e..54ac39aaf 100644
--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
@@ -22,6 +22,7 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletCollision/CollisionDispatch/btCollisionWorld.h"
 #include "BulletDynamics/ConstraintSolver/btTypedConstraint.h"
+#include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h"  // for s_minimumContactManifoldsForBatching
 
 //#include <stdio.h>
 #include "LinearMath/btQuickprof.h"
@@ -589,14 +590,52 @@ struct UpdateIslandDispatcher : public btIParallelForBody
     }
 };
 
+
 void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback )
 {
     BT_PROFILE( "parallelIslandDispatch" );
-    int grainSize = 1;  // iterations per task
+    //
+    // if there are islands with many contacts, it may be faster to submit these
+    // large islands *serially* to a single parallel constraint solver, and then later
+    // submit the remaining smaller islands in parallel to multiple sequential solvers.
+    //
+    // Some task schedulers do not deal well with nested parallelFor loops. One implementation
+    // of OpenMP was actually slower than doing everything single-threaded. Intel TBB
+    // on the other hand, seems to do a pretty respectable job with it.
+    //
+    // When solving islands in parallel, the worst case performance happens when there
+    // is one very large island and then perhaps a smattering of very small
+    // islands -- one worker thread takes the large island and the remaining workers
+    // tear through the smaller islands and then sit idle waiting for the first worker
+    // to finish. Solving islands in parallel works best when there are numerous small
+    // islands, roughly equal in size.
+    //
+    // By contrast, the other approach -- the parallel constraint solver -- is only
+    // able to deliver a worthwhile speedup when the island is large. For smaller islands,
+    // it is difficult to extract a useful amount of parallelism -- the overhead of grouping
+    // the constraints into batches and sending the batches to worker threads can nullify
+    // any gains from parallelism.
+    //
+
     UpdateIslandDispatcher dispatcher;
     dispatcher.islandsPtr = islandsPtr;
     dispatcher.callback = callback;
-    btParallelFor( 0, islandsPtr->size(), grainSize, dispatcher );
+    // We take advantage of the fact that the islands are sorted in order of decreasing size (assumed here, not checked)
+    int iBegin = 0;  // after the loop: index of the first island that is too small for the batching solver
+    while (iBegin < islandsPtr->size())
+    {
+        btSimulationIslandManagerMt::Island* island = (*islandsPtr)[ iBegin ];
+        if (island->manifoldArray.size() < btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching)
+        {
+            // OK to submit the rest of the array in parallel
+            break;
+        }
+        ++iBegin;
+    }
+    // serial dispatch for large islands (if any); islands [0, iBegin) run one after another on the calling thread
+    dispatcher.forLoop(0, iBegin);
+    // parallel dispatch for rest; grain size 1 as before this change (one iteration, i.e. one island, per task)
+    btParallelFor( iBegin, islandsPtr->size(), 1, dispatcher );
 }
 
 
diff --git a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
index 9a781aaef..31a2053b4 100644
--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
@@ -106,5 +106,7 @@ public:
     }
 };
 
+extern int gLargeIslandManifoldCount;
+
 #endif //BT_SIMULATION_ISLAND_MANAGER_H
 
diff --git a/src/LinearMath/CMakeLists.txt b/src/LinearMath/CMakeLists.txt
index ede21d9a7..0c8c0133a 100644
--- a/src/LinearMath/CMakeLists.txt
+++ b/src/LinearMath/CMakeLists.txt
@@ -14,6 +14,9 @@ SET(LinearMath_SRCS
 	btSerializer64.cpp
 	btThreads.cpp
 	btVector3.cpp
+	TaskScheduler/btTaskScheduler.cpp
+	TaskScheduler/btThreadSupportPosix.cpp
+	TaskScheduler/btThreadSupportWin32.cpp
 )
 
 SET(LinearMath_HDRS
@@ -44,6 +47,7 @@ SET(LinearMath_HDRS
 	btTransform.h
 	btTransformUtil.h
 	btVector3.h
+	TaskScheduler/btThreadSupportInterface.h
 )
 
 ADD_LIBRARY(LinearMath ${LinearMath_SRCS} ${LinearMath_HDRS})
diff --git a/src/LinearMath/TaskScheduler/btTaskScheduler.cpp b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp
new file mode 100644
index 000000000..e02458367
--- /dev/null
+++ b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp
@@ -0,0 +1,619 @@
+
+#include "LinearMath/btMinMax.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btThreads.h"
+#include "LinearMath/btQuickprof.h"
+#include <stdio.h>
+#include <algorithm>
+
+
+typedef void( *btThreadFunc )( void* userPtr, void* lsMemory );
+typedef void* ( *btThreadLocalStorageFunc )();
+
+#if BT_THREADSAFE
+
+#include "btThreadSupportInterface.h"
+
+
+
+
+///
+/// getNumHardwareThreads()
+///
+///
+/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
+///
+#if __cplusplus >= 201103L
+
+#include <thread>
+
+int getNumHardwareThreads()
+{
+    return static_cast<int>( std::thread::hardware_concurrency() );  // C++11 and later: ask the standard library
+}
+
+#elif defined( _WIN32 )
+
+#define WIN32_LEAN_AND_MEAN
+
+#include <windows.h>
+
+int getNumHardwareThreads()
+{
+    // caps out at 32
+    SYSTEM_INFO sysInfo;
+    GetSystemInfo( &sysInfo );
+    return static_cast<int>( sysInfo.dwNumberOfProcessors );
+}
+
+#else
+
+int getNumHardwareThreads()
+{
+    return 0;  // don't know
+}
+
+#endif
+
+
+void btSpinPause()
+{
+#if defined( _WIN32 )
+    YieldProcessor();  // only Windows gets a pause here; on every other platform this function compiles to an empty body
+#endif
+}
+
+
+struct WorkerThreadStatus
+{
+    enum Type
+    {
+        kInvalid,         // not assigned anywhere in the visible code
+        kWaitingForWork,  // set by WorkerThreadFunc after it has drained the job queue
+        kWorking,         // set by WorkerThreadFunc right before it executes a job
+        kSleeping,        // set by the scheduler's init() and by WorkerThreadFunc just before it returns
+    };
+};
+
+
+struct IJob  // interface of a unit of work stored in JobContext::m_jobQueue
+{
+    virtual void executeJob(int threadId) = 0;  // threadId is 0 when run from waitJobs() (main thread), otherwise the worker's WorkerThreadLocalStorage::threadId
+};
+
+// Job that runs one slice (mBegin..mEnd) of a parallel-for body.
+// Only a pointer to the body is stored, so the body must outlive the job.
+class ParallelForJob : public IJob
+{
+    const btIParallelForBody* mBody;
+    int mBegin;
+    int mEnd;
+
+public:
+    ParallelForJob( int iBegin, int iEnd, const btIParallelForBody& body )
+        : mBody( &body ), mBegin( iBegin ), mEnd( iEnd )
+    {
+    }
+    virtual void executeJob(int threadId) BT_OVERRIDE
+    {
+        BT_PROFILE( "executeJob" );
+
+        // threadId is not needed here: simply run the functor body over this job's slice
+        mBody->forLoop( mBegin, mEnd );
+    }
+};
+
+static const int kCacheLineSize = 64;
+
+struct ThreadLocalSum
+{
+    btScalar mSum;  // partial sum of one thread; ParallelSumJob::executeJob adds to the entry indexed by its thread id
+    char mCachePadding[ kCacheLineSize - sizeof( btScalar ) ];  // fills the struct up to kCacheLineSize bytes (presumably so neighbouring threads' sums do not share a cache line)
+};
+
+// Job that evaluates one slice (mBegin..mEnd) of a parallel-sum body and adds the
+// rounded result to the partial sum that belongs to the executing thread.
+class ParallelSumJob : public IJob
+{
+    const btIParallelSumBody* mBody;
+    ThreadLocalSum* mSumArray;  // array of per-thread accumulators, indexed by thread id (not owned)
+    int mBegin;
+    int mEnd;
+
+public:
+    ParallelSumJob( int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalSum* sums )
+        : mBody( &body ), mSumArray( sums ), mBegin( iBegin ), mEnd( iEnd )
+    {
+    }
+    virtual void executeJob( int threadId ) BT_OVERRIDE
+    {
+        BT_PROFILE( "executeJob" );
+
+        // let the functor body compute the sum over this job's slice
+        btScalar partialSum = mBody->sumLoop( mBegin, mEnd );
+        // dropping the low bits of each partial result makes the parallelSum deterministic (at the expense of precision);
+        // the value is rounded to the nearest multiple of 1/TRUNC_SCALE
+        const float TRUNC_SCALE = float(1<<19);
+        partialSum = floor(partialSum*TRUNC_SCALE+0.5f)/TRUNC_SCALE;
+        mSumArray[threadId].mSum += partialSum;
+    }
+};
+
+
+struct JobContext  // state shared between the scheduler (producer) and the worker threads (consumers): the job queue, its lock and the control flags
+{
+    JobContext()
+    {
+        m_queueLock = NULL;
+        m_headIndex = 0;
+        m_tailIndex = 0;
+        m_queueIsEmpty = true;  // must start out true: consumeJob() and the worker spin loop read it before any submitJob()/clearQueue()
+        m_workersShouldCheckQueue = false;
+        m_workersShouldSleep = false;
+        m_useSpinMutex = false;
+        m_coolDownTime = 1000; // 1000 microseconds
+    }
+    btCriticalSection* m_queueLock;  // queue lock used while m_useSpinMutex is false; not created here (the scheduler's init() assigns it)
+    btSpinMutex m_mutex;  // queue lock used while m_useSpinMutex is true
+    volatile bool m_workersShouldCheckQueue;  // while true, idle workers keep restarting their cool-down timer
+    volatile bool m_workersShouldSleep;  // while true, idle workers leave their loop and go to sleep
+    btAlignedObjectArray<IJob*> m_jobQueue;
+    bool m_queueIsEmpty;  // written under the queue lock, but also read without it (consumeJob fast path, worker spin loop)
+    int m_tailIndex;  // number of jobs submitted since the last clearQueue()
+    int m_headIndex;  // index of the next job consumeJob() hands out
+    bool m_useSpinMutex;
+    unsigned int m_coolDownTime;  // microseconds an idle worker waits on an empty queue before it goes to sleep
+    btClock m_clock;
+
+    void lockQueue()
+    {
+        if ( m_useSpinMutex )
+        {
+            m_mutex.lock();
+        }
+        else
+        {
+            m_queueLock->lock();
+        }
+    }
+    void unlockQueue()
+    {
+        if ( m_useSpinMutex )
+        {
+            m_mutex.unlock();
+        }
+        else
+        {
+            m_queueLock->unlock();
+        }
+    }
+    void clearQueue()
+    {
+        lockQueue();
+        m_headIndex = 0;
+        m_tailIndex = 0;
+        m_queueIsEmpty = true;
+        unlockQueue();
+        m_jobQueue.resizeNoInitialize( 0 );  // done after the lock is released
+    }
+    void submitJob( IJob* job )
+    {
+        m_jobQueue.push_back( job );  // NOTE(review): appended outside the lock -- only safe if this can never reallocate while a worker indexes the array; confirm the caller reserves capacity
+        lockQueue();
+        m_tailIndex++;
+        m_queueIsEmpty = false;
+        unlockQueue();
+    }
+    IJob* consumeJob()  // returns the next queued job, or NULL when the queue is empty
+    {
+        if ( m_queueIsEmpty )
+        {
+            // lock free path. even if this is taken erroneously it isn't harmful
+            return NULL;
+        }
+        IJob* job = NULL;
+        lockQueue();
+        if ( !m_queueIsEmpty )  // re-checked under the lock: another thread may have taken the last job in the meantime
+        {
+            job = m_jobQueue[ m_headIndex++ ];
+            if ( m_headIndex == m_tailIndex )
+            {
+                m_queueIsEmpty = true;
+            }
+        }
+        unlockQueue();
+        return job;
+    }
+};
+
+
+struct WorkerThreadLocalStorage  // per-worker state block; allocated by WorkerThreadAllocFunc and handed to WorkerThreadFunc as lsMemory
+{
+    int threadId;  // assigned in btTaskSchedulerDefault::init(): worker index + 1 (0 is used for the main thread in waitJobs)
+    WorkerThreadStatus::Type status;
+    int numJobsFinished;  // incremented by the worker after each job; summed up by waitJobs() under m_mutex
+    btSpinMutex m_mutex;  // held by the worker while it drains the queue and while it updates status on exit
+};
+
+
+static void WorkerThreadFunc( void* userPtr, void* lsMemory )  // worker entry point: userPtr is the shared JobContext, lsMemory this worker's WorkerThreadLocalStorage
+{
+    BT_PROFILE( "WorkerThreadFunc" );
+    WorkerThreadLocalStorage* localStorage = (WorkerThreadLocalStorage*) lsMemory;
+    JobContext* jobContext = (JobContext*) userPtr;
+
+    bool shouldSleep = false;
+    while (! shouldSleep)
+    {
+        // do work: drain the job queue while holding this worker's own mutex
+        localStorage->m_mutex.lock();
+        while ( IJob* job = jobContext->consumeJob() )
+        {
+            localStorage->status = WorkerThreadStatus::kWorking;
+            job->executeJob( localStorage->threadId );
+            localStorage->numJobsFinished++;
+        }
+        localStorage->status = WorkerThreadStatus::kWaitingForWork;
+        localStorage->m_mutex.unlock();
+        unsigned long long int clockStart = jobContext->m_clock.getTimeMicroseconds();  // start of the cool-down period
+        // while queue is empty,
+        while (jobContext->m_queueIsEmpty)  // read without the queue lock; once it turns false the outer loop drains the queue again
+        {
+            // todo: spin wait a bit to avoid hammering the empty queue
+            btSpinPause();
+            if ( jobContext->m_workersShouldSleep )  // explicit sleep request: leave immediately
+            {
+                shouldSleep = true;
+                break;
+            }
+            // if jobs are incoming,
+            if (jobContext->m_workersShouldCheckQueue)
+            {
+                clockStart = jobContext->m_clock.getTimeMicroseconds(); // reset clock
+            }
+            else
+            {
+                // if no jobs incoming and queue has been empty for the cooldown time, sleep
+                unsigned long long int timeElapsed = jobContext->m_clock.getTimeMicroseconds() - clockStart;
+                if (timeElapsed > jobContext->m_coolDownTime)
+                {
+                    shouldSleep = true;
+                    break;
+                }
+            }
+        }
+    }
+
+    // go idle: returning from this function ends the task; the status is published under the worker's mutex first
+    localStorage->m_mutex.lock();
+    localStorage->status = WorkerThreadStatus::kSleeping;
+    localStorage->m_mutex.unlock();
+}
+
+
+static void* WorkerThreadAllocFunc()  // passed to the thread support ConstructionInfo in btTaskSchedulerDefault::init()
+{
+    return new WorkerThreadLocalStorage;  // threadId and status are assigned later in init(); no matching delete is visible in this part of the file
+}
+
+
+
+class btTaskSchedulerDefault : public btITaskScheduler
+{
+    JobContext m_jobContext;
+    btThreadSupportInterface* m_threadSupport;
+    btAlignedObjectArray<char> m_jobMem;
+    btAlignedObjectArray<char> m_threadLocalMem;
+    btSpinMutex m_antiNestingLock;  // prevent nested parallel-for
+    int m_numThreads;
+    int m_numWorkerThreads;
+    int m_maxNumThreads;
+    int m_numJobs;
+public:
+
+    btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
+    {
+        m_threadSupport = NULL;  // created later by init(); the thread-count members are also left unset until init() runs
+    }
+
+    virtual ~btTaskSchedulerDefault()
+    {
+        shutdown();  // always invoked on destruction, whether or not init() was ever called
+    }
+
+    void init()  // creates the thread support object and the queue lock, and marks every worker as sleeping
+    {
+        btThreadSupportInterface::ConstructionInfo constructionInfo( "TaskScheduler", WorkerThreadFunc, WorkerThreadAllocFunc );
+        m_threadSupport = btThreadSupportInterface::create( constructionInfo );
+
+        m_numWorkerThreads = m_threadSupport->getNumWorkerThreads();
+        m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1;  // +1: the main thread also executes jobs (see waitJobs)
+        m_numThreads = m_maxNumThreads;
+        m_jobContext.m_queueLock = m_threadSupport->createCriticalSection();
+        for ( int i = 0; i < m_numWorkerThreads; i++ )
+        {
+            WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
+            btAssert( storage );
+            storage->threadId = i + 1;  // workers start at 1
+            storage->status = WorkerThreadStatus::kSleeping;
+        }
+        setWorkersActive( false ); // no work for them yet
+        setNumThreads( m_threadSupport->getCacheFriendlyNumThreads() );  // clamped to [1, m_maxNumThreads] by setNumThreads
+    }
+
+    virtual void shutdown()  // puts the workers to sleep and releases the queue lock and the thread support object; safe to call repeatedly
+    {
+        if ( m_threadSupport == NULL ) return;  // init() never ran or shutdown() already ran: nothing to release (the destructor always calls this)
+        setWorkersActive( false );
+        waitForWorkersToSleep();
+        m_threadSupport->deleteCriticalSection( m_jobContext.m_queueLock );
+        m_jobContext.m_queueLock = NULL;
+        delete m_threadSupport;
+        m_threadSupport = NULL;
+    }
+
+    void setWorkersActive( bool active )
+    {
+        m_jobContext.m_workersShouldCheckQueue = active;  // only sets the flag the workers poll; no thread is woken here
+    }
+
+    virtual int getMaxNumThreads() const BT_OVERRIDE
+    {
+        return m_maxNumThreads;  // set in init() to the number of worker threads + 1
+    }
+
+    virtual int getNumThreads() const BT_OVERRIDE
+    {
+        return m_numThreads;  // current thread count as last set by init() or setNumThreads()
+    }
+
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
+    {
+        m_numThreads = btMax( btMin(numThreads, int(m_maxNumThreads)), 1 );  // clamp the request to [1, m_maxNumThreads]
+        m_numWorkerThreads = m_numThreads - 1;  // the remaining thread is the main thread
+    }
+
+    void waitJobs()  // main thread helps to drain the queue, then spins until the finished-job counts add up to m_numJobs
+    {
+        BT_PROFILE( "waitJobs" );
+        // have the main thread work until the job queue is empty
+        int numMainThreadJobsFinished = 0;
+        while ( IJob* job = m_jobContext.consumeJob() )
+        {
+            job->executeJob( 0 );  // thread id 0 = main thread
+            numMainThreadJobsFinished++;
+        }
+        // done with jobs for now, tell workers to rest
+        setWorkersActive( false );
+
+        unsigned long long int clockStart = m_jobContext.m_clock.getTimeMicroseconds();
+        // wait for workers to finish any jobs in progress
+        while ( true )
+        {
+            int numWorkerJobsFinished = 0;
+            for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
+            {
+                WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
+                storage->m_mutex.lock();  // the worker holds this mutex while it executes jobs, so the count is read between its work bursts
+                numWorkerJobsFinished += storage->numJobsFinished;
+                storage->m_mutex.unlock();
+            }
+            if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs)
+            {
+                break;
+            }
+            unsigned long long int timeElapsed = m_jobContext.m_clock.getTimeMicroseconds() - clockStart;
+            btAssert(timeElapsed < 1000);  // NOTE(review): asserts after 1000 us, although the give-up threshold below is 100000 us -- confirm intended
+            if (timeElapsed > 100000)  // NOTE(review): gives up after 100000 us even if the job counts do not add up yet
+            {
+                break;
+            }
+            btSpinPause();
+        }
+    }
+
+    void wakeWorkers(int numWorkersToWake)  // starts sleeping workers until min(numWorkersToWake, m_numWorkerThreads) are counted as active
+    {
+        BT_PROFILE( "wakeWorkers" );
+        btAssert( m_jobContext.m_workersShouldCheckQueue );  // callers are expected to have activated the workers flag beforehand
+        int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads);
+        int numActiveWorkers = 0;
+        for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
+        {
+            // note this count of active workers is not necessarily totally reliable, because a worker thread could be
+            // just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare.
+            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
+            if (storage->status != WorkerThreadStatus::kSleeping)  // status is read here without taking the worker's mutex
+            {
+                numActiveWorkers++;
+            }
+        }
+        for ( int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker )
+        {
+            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
+            if (storage->status == WorkerThreadStatus::kSleeping)
+            {
+                m_threadSupport->runTask( iWorker, &m_jobContext );  // the JobContext is handed over as the task's user pointer (WorkerThreadFunc casts userPtr to JobContext*)
+                numActiveWorkers++;
+            }
+        }
+    }
+
+    void waitForWorkersToSleep()
+    {
+        BT_PROFILE( "waitForWorkersToSleep" );
+        m_jobContext.m_workersShouldSleep = true;  // request sleep before blocking on the workers
+        m_threadSupport->waitForAllTasks();  // blocks until the thread support reports no task outstanding
+        for ( int i = 0; i < m_numWorkerThreads; i++ )
+        {
+            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory(i) );
+            btAssert( storage );
+            btAssert( storage->status == WorkerThreadStatus::kSleeping );  // sanity check only; the loop has no other effect
+        }
+    }
+
+    virtual void sleepWorkerThreadsHint() BT_OVERRIDE
+    {
+        BT_PROFILE( "sleepWorkerThreadsHint" );
+        // hint the task scheduler that we may not be using these threads for a little while
+        m_jobContext.m_workersShouldSleep = true;  // only raises the flag -- unlike waitForWorkersToSleep() it does not wait
+    }
+
+    void prepareWorkerThreads()  // zeroes each worker's finished-job counter, then re-enables the workers
+    {
+        for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
+        {
+            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
+            storage->m_mutex.lock();
+            storage->numJobsFinished = 0;  // reset under the worker's mutex; waitJobs() sums these counters
+            storage->m_mutex.unlock();
+        }
+        m_jobContext.m_workersShouldSleep = false;  // withdraw any earlier sleep request
+        setWorkersActive( true );
+    }
+
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {  // runs body.forLoop over [iBegin, iEnd), as jobs of at most grainSize iterations when worker threads can be used
+        BT_PROFILE( "parallelFor_ThreadSupport" );
+        btAssert( iEnd >= iBegin );
+        btAssert( grainSize >= 1 );
+        int iterationCount = iEnd - iBegin;
+        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )  // lock presumably rejects nested parallel calls -- confirm
+        {
+            typedef ParallelForJob JobType;
+            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;  // iterationCount / grainSize, rounded up
+            m_numJobs = jobCount;  // waitJobs() compares the finished-job counts against this
+            btAssert( jobCount >= 2 );  // need more than one job for multithreading
+            int jobSize = sizeof( JobType );
+            int jobBufSize = jobSize * jobCount;
+            // make sure we have enough memory allocated to store jobs
+            if ( jobBufSize > m_jobMem.size() )
+            {
+                m_jobMem.resize( jobBufSize );
+            }
+            // make sure job queue is big enough
+            if ( jobCount > m_jobContext.m_jobQueue.capacity() )
+            {
+                m_jobContext.m_jobQueue.reserve( jobCount );
+            }
+
+            m_jobContext.clearQueue();
+            // prepare worker threads for incoming work
+            prepareWorkerThreads();
+            // submit all of the jobs
+            int iJob = 0;
+            JobType* jobs = reinterpret_cast<JobType*>( &m_jobMem[ 0 ] );  // jobs are constructed in place inside m_jobMem
+            for ( int i = iBegin; i < iEnd; i += grainSize )
+            {
+                btAssert( iJob < jobCount );
+                int iE = btMin( i + grainSize, iEnd );  // the last chunk is clamped to iEnd
+                JobType& job = jobs[ iJob ];
+                new ( (void*) &job ) ParallelForJob( i, iE, body );  // placement new
+                m_jobContext.submitJob( &job );
+                iJob++;
+            }
+            wakeWorkers( jobCount - 1 );  // the calling thread takes jobs too (see waitJobs), hence jobCount - 1
+
+            // put the main thread to work on emptying the job queue and then wait for all workers to finish
+            waitJobs();
+            m_antiNestingLock.unlock();
+        }
+        else
+        {
+            BT_PROFILE( "parallelFor_mainThread" );
+            // just run on main thread
+            body.forLoop( iBegin, iEnd );
+        }
+    }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {  // returns the sum of body.sumLoop over [iBegin, iEnd), computed as jobs of at most grainSize iterations when workers can be used
+        BT_PROFILE( "parallelSum_ThreadSupport" );
+        btAssert( iEnd >= iBegin );
+        btAssert( grainSize >= 1 );
+        int iterationCount = iEnd - iBegin;
+        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )  // lock presumably rejects nested parallel calls -- confirm
+        {
+            typedef ParallelSumJob JobType;
+            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;  // iterationCount / grainSize, rounded up
+            m_numJobs = jobCount;  // waitJobs() compares the finished-job counts against this
+            btAssert( jobCount >= 2 );  // need more than one job for multithreading
+            int jobSize = sizeof( JobType );
+            int jobBufSize = jobSize * jobCount;
+            // make sure we have enough memory allocated to store jobs
+            if ( jobBufSize > m_jobMem.size() )
+            {
+                m_jobMem.resize( jobBufSize );
+            }
+            // make sure job queue is big enough
+            if ( jobCount > m_jobContext.m_jobQueue.capacity() )
+            {
+                m_jobContext.m_jobQueue.reserve( jobCount );
+            }
+            // make sure thread local area is big enough
+            int threadLocalSize = m_numThreads * sizeof( ThreadLocalSum );  // one ThreadLocalSum slot per thread
+            if ( threadLocalSize > m_threadLocalMem.size() )
+            {
+                m_threadLocalMem.resize( threadLocalSize );
+            }
+            // initialize summation
+            ThreadLocalSum* threadLocalSum = reinterpret_cast<ThreadLocalSum*>( &m_threadLocalMem[ 0 ] );
+            for ( int iThread = 0; iThread < m_numThreads; ++iThread )
+            {
+                threadLocalSum[ iThread ].mSum = btScalar( 0 );
+            }
+
+            m_jobContext.clearQueue();
+            // prepare worker threads for incoming work
+            prepareWorkerThreads();
+            // submit all of the jobs
+            int iJob = 0;
+            JobType* jobs = reinterpret_cast<JobType*>( &m_jobMem[ 0 ] );  // jobs are constructed in place inside m_jobMem
+            for ( int i = iBegin; i < iEnd; i += grainSize )
+            {
+                btAssert( iJob < jobCount );
+                int iE = btMin( i + grainSize, iEnd );  // the last chunk is clamped to iEnd
+                JobType& job = jobs[ iJob ];
+                new ( (void*) &job ) ParallelSumJob( i, iE, body, threadLocalSum );  // placement new
+                m_jobContext.submitJob( &job );
+                iJob++;
+            }
+            wakeWorkers( jobCount - 1 );  // the calling thread takes jobs too (see waitJobs), hence jobCount - 1
+
+            // put the main thread to work on emptying the job queue and then wait for all workers to finish
+            waitJobs();
+            m_antiNestingLock.unlock();
+
+            // add up all the thread sums
+            btScalar sum = btScalar(0);
+            for ( int iThread = 0; iThread < m_numThreads; ++iThread )  // slots are added in thread-index order
+            {
+                sum += threadLocalSum[ iThread ].mSum;
+            }
+            return sum;
+        }
+        else
+        {
+            BT_PROFILE( "parallelSum_mainThread" );
+            // just run on main thread
+            return body.sumLoop( iBegin, iEnd );
+        }
+    }
+};
+
+
+
+btITaskScheduler* btCreateDefaultTaskScheduler()  // BT_THREADSAFE build: allocates the default scheduler with new and calls init()
+{
+    btTaskSchedulerDefault* ts = new btTaskSchedulerDefault();
+    ts->init();
+    return ts;
+}
+
+#else // #if BT_THREADSAFE
+
+btITaskScheduler* btCreateDefaultTaskScheduler()  // build without BT_THREADSAFE: there is no default scheduler, callers get NULL
+{
+    return NULL;
+}
+
+#endif // #else // #if BT_THREADSAFE
\ No newline at end of file
diff --git a/src/LinearMath/TaskScheduler/btThreadSupportInterface.h b/src/LinearMath/TaskScheduler/btThreadSupportInterface.h
new file mode 100644
index 000000000..d537d7095
--- /dev/null
+++ b/src/LinearMath/TaskScheduler/btThreadSupportInterface.h
@@ -0,0 +1,75 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2018 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_THREAD_SUPPORT_INTERFACE_H
+#define BT_THREAD_SUPPORT_INTERFACE_H
+
+
+
+class btCriticalSection  // abstract lock interface; instances come from btThreadSupportInterface::createCriticalSection()
+{
+public:
+    btCriticalSection() {}
+    virtual ~btCriticalSection() {}
+
+    virtual void lock() = 0;  // acquire; implemented by each platform backend
+    virtual void unlock() = 0;  // release
+};
+
+
+class btThreadSupportInterface  // platform abstraction over a set of worker threads; obtain an instance through create()
+{
+public:
+
+    virtual ~btThreadSupportInterface() {}
+
+    virtual int getNumWorkerThreads() const = 0;  // number of worker threads (total number of logical processors - 1)
+    virtual int getCacheFriendlyNumThreads() const = 0;  // the number of logical processors sharing a single L3 cache
+    virtual void runTask( int threadIndex, void* userData ) = 0;  // starts the user thread function on worker threadIndex with userData
+    virtual void waitForAllTasks() = 0;  // blocks until every task started with runTask() has completed
+
+    virtual btCriticalSection* createCriticalSection() = 0;
+    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) = 0;  // releases a lock obtained from createCriticalSection()
+
+    virtual void* getThreadLocalMemory( int taskId ) { return NULL; }  // per-worker memory; this base default has none
+
+    typedef void( *ThreadFunc )( void* userPtr, void* lsMemory );  // userPtr is runTask's userData, lsMemory the worker's local memory
+    typedef void* ( *MemorySetupFunc )( );  // returns the local memory block for one worker
+
+    struct ConstructionInfo
+    {
+        ConstructionInfo( const char* uniqueName,
+            ThreadFunc userThreadFunc,
+            MemorySetupFunc	lsMemoryFunc,
+            int threadStackSize = 65535  // presumably bytes -- TODO confirm against the backends
+        )
+            :m_uniqueName( uniqueName ),
+            m_userThreadFunc( userThreadFunc ),
+            m_lsMemoryFunc( lsMemoryFunc ),
+            m_threadStackSize( threadStackSize )
+        {
+        }
+
+        const char*     m_uniqueName;  // stored as given; the string is not copied
+        ThreadFunc      m_userThreadFunc;
+        MemorySetupFunc m_lsMemoryFunc;
+        int             m_threadStackSize;
+    };
+
+    static btThreadSupportInterface* create( const ConstructionInfo& info );  // defined once per platform source file
+};
+
+#endif //BT_THREAD_SUPPORT_INTERFACE_H
+
diff --git a/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp b/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
new file mode 100644
index 000000000..5521fc555
--- /dev/null
+++ b/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
@@ -0,0 +1,369 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2018 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#if BT_THREADSAFE && !defined( _WIN32 )
+
+
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btThreads.h"
+#include "LinearMath/btMinMax.h"
+#include "btThreadSupportInterface.h"
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+
+
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600 //for definition of pthread_barrier_t, see http://pages.cs.wisc.edu/~travitch/pthreads_primer.html
+#endif //_XOPEN_SOURCE
+#include <pthread.h>
+#include <semaphore.h>
+#include <unistd.h>   //for sysconf
+
+
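+///
+/// btGetNumHardwareThreads()
+///
+/// Returns the number of hardware threads: from the C++11 standard library when available, otherwise from sysconf.
+/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
+///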
+#if __cplusplus >= 201103L
+
+#include <thread>
+
+int btGetNumHardwareThreads()
+{
+    return btMax( 1, int( std::thread::hardware_concurrency() ) );  // hardware_concurrency() is 0 when unknown: report at least 1
+}
+
+#else
+
+int btGetNumHardwareThreads()
+{
+    return btMax( 1, int( sysconf( _SC_NPROCESSORS_ONLN ) ) );  // sysconf returns -1 on error: report at least 1
+}
+
+#endif
+
+
+// btThreadSupportPosix implements btThreadSupportInterface with pthreads and POSIX semaphores
+class btThreadSupportPosix : public btThreadSupportInterface
+{
+public:
+    struct btThreadStatus
+    {
+        int m_taskId;  // index of the worker, assigned in startThreads
+        int m_commandId;
+        int m_status;  // 0 = idle, 1 = task started, 2 = task finished, 3 = thread exiting
+
+        ThreadFunc m_userThreadFunc;
+        void* m_userPtr; //for taskDesc etc; a null pointer tells the thread to exit
+        void* m_lsMemory; //created by ConstructionInfo::m_lsMemoryFunc in startThreads
+
+        pthread_t thread;
+        //each thread will wait until this signal to start its work
+        sem_t* startSemaphore;
+
+        // this is a copy of m_mainSemaphore,
+        //each thread will signal once it is finished with its work
+        sem_t* m_mainSemaphore;
+        unsigned long threadUsed;  // number of tasks this worker has run
+    };
+private:
+    typedef unsigned long long UINT64;
+
+    btAlignedObjectArray<btThreadStatus> m_activeThreadStatus;
+    // m_mainSemaphore is signalled by each worker when it finishes a task or exits
+    sem_t* m_mainSemaphore;
+    int m_numThreads;  // number of worker threads (the main thread is not included)
+    UINT64 m_startedThreadsMask;  // bit i is set by runTask for worker i and cleared by waitForResponse
+    void startThreads( const ConstructionInfo& threadInfo );
+    void stopThreads();
+    int waitForResponse();
+
+public:
+    btThreadSupportPosix( const ConstructionInfo& threadConstructionInfo );
+    virtual ~btThreadSupportPosix();
+
+    virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
+    // TODO: return the number of logical processors sharing the first L3 cache
+    virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return m_numThreads + 1; }
+
+    virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE;
+    virtual void waitForAllTasks() BT_OVERRIDE;
+
+    virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
+    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE;
+
+    virtual void* getThreadLocalMemory( int taskId ) BT_OVERRIDE
+    {
+        return m_activeThreadStatus[ taskId ].m_lsMemory;  // taskId is a worker index; no range check here
+    }
+};
+
+
+#define checkPThreadFunction(returnValue) /* logs a non-zero result; the argument is evaluated exactly once */ \
+    do { const int checkedResult_ = (returnValue); if(0 != checkedResult_) { \
+        printf("PThread problem at line %i in file %s: %i %d\n", __LINE__, __FILE__, checkedResult_, errno); \
+    } } while(0)
+
+// The number of threads should be equal to the number of available cores
+// Todo: each worker should be pinned to a single core (SetThreadIdealProcessor is the Win32 call; POSIX needs its own mechanism).
+
+
+btThreadSupportPosix::btThreadSupportPosix( const ConstructionInfo& threadConstructionInfo )
+{
+    startThreads( threadConstructionInfo );  // creates the semaphores and the worker threads right away
+}
+
+// stops and joins the worker threads and destroys the semaphores (all done by stopThreads)
+btThreadSupportPosix::~btThreadSupportPosix()
+{
+    stopThreads();
+}
+
+#if (defined (__APPLE__))
+#define NAMED_SEMAPHORES
+#endif
+
+
+static sem_t* createSem( const char* baseName )  // returns a new semaphore whose initial count is 0
+{
+    static int semCount = 0;
+#ifdef NAMED_SEMAPHORES
+    /// Named semaphore begin
+    char name[ 32 ];
+    // "/<up to 8 chars of baseName>-<pid>-<4-digit counter>"; the former "%8.s" had precision 0 and printed no part of baseName
+    snprintf( name, 32, "/%.8s-%d-%4.4d", baseName, getpid(), semCount++ );
+    sem_t* tempSem = sem_open( name, O_CREAT, 0600, 0 );
+    if ( tempSem != reinterpret_cast<sem_t *>( SEM_FAILED ) )
+    {
+        // the open handle stays usable after unlinking; removing the name keeps it from outliving this process
+        sem_unlink( name );
+    }
+    else
+    {
+        exit( -1 );
+    }
+    /// Named semaphore end
+#else
+    sem_t* tempSem = new sem_t;
+    checkPThreadFunction( sem_init( tempSem, 0, 0 ) );
+#endif
+    return tempSem;
+}
+
+static void destroySem( sem_t* semaphore )  // counterpart of createSem()
+{
+#ifdef NAMED_SEMAPHORES
+    checkPThreadFunction( sem_close( semaphore ) );  // closes the handle; this function does not call sem_unlink
+#else
+    checkPThreadFunction( sem_destroy( semaphore ) );
+    delete semaphore;  // releases the sem_t that createSem() allocated with new
+#endif
+}
+
+static void *threadFunction( void *argument )  // worker entry point: runs one task per start signal until told to exit
+{
+    btThreadSupportPosix::btThreadStatus* status = ( btThreadSupportPosix::btThreadStatus* )argument;
+
+    while ( 1 )
+    {
+        checkPThreadFunction( sem_wait( status->startSemaphore ) );
+        void* userPtr = status->m_userPtr;
+
+        if ( userPtr )
+        {
+            btAssert( status->m_status );
+            status->m_userThreadFunc( userPtr, status->m_lsMemory );
+            status->threadUsed++;  // counted before signalling, so the main thread never reads it mid-update
+            status->m_status = 2;
+            checkPThreadFunction( sem_post( status->m_mainSemaphore ) );
+        }
+        else
+        {
+            //exit Thread (a null user pointer is the exit request)
+            status->m_status = 3;
+            checkPThreadFunction( sem_post( status->m_mainSemaphore ) );
+            printf( "Thread with taskId %i exiting\n", status->m_taskId );
+            break;
+        }
+    }
+    printf( "Thread TERMINATED\n" );
+    return NULL;  // a pthread start routine must return a value; flowing off the end is undefined behaviour
+}
+
+///hands userData to worker threadIndex, marks it as started and wakes it through its start semaphore
+void btThreadSupportPosix::runTask( int threadIndex, void* userData )
+{
+    // range-check the index before it is used to look up the status record
+    btAssert( threadIndex >= 0 );
+    btAssert( threadIndex < m_activeThreadStatus.size() );
+    btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ];
+
+    threadStatus.m_commandId = 1;
+    threadStatus.m_status = 1;
+    threadStatus.m_userPtr = userData;
+    m_startedThreadsMask |= UINT64( 1 ) << threadIndex;
+
+    // fire event to start new task
+    checkPThreadFunction( sem_post( threadStatus.startSemaphore ) );
+}
+
+
+///waits for one worker to signal completion and returns its index, or -1 if no finished worker could be identified
+int btThreadSupportPosix::waitForResponse()
+{
+    btAssert( m_activeThreadStatus.size() );
+
+    // wait for any of the threads to finish
+    checkPThreadFunction( sem_wait( m_mainSemaphore ) );
+
+    // find a thread that has finished (status 2 is set by the worker before it signals)
+    int last = -1;
+    for ( int t = 0; t < m_activeThreadStatus.size(); ++t )
+    {
+        if ( 2 == m_activeThreadStatus[ t ].m_status )
+        {
+            last = t;
+            break;
+        }
+    }
+
+    // nothing finished, e.g. sem_wait was interrupted: report that instead of indexing the array with -1.
+    // waitForAllTasks() simply calls again while tasks are still outstanding.
+    if ( last < 0 )
+    {
+        return -1;
+    }
+
+    btThreadStatus& threadStatus = m_activeThreadStatus[ last ];
+    btAssert( threadStatus.m_status > 1 );
+    threadStatus.m_status = 0;
+    m_startedThreadsMask &= ~( UINT64( 1 ) << last );
+
+    return last;
+}
+
+
+void btThreadSupportPosix::waitForAllTasks()  // blocks until every task started with runTask() has been collected
+{
+    while ( m_startedThreadsMask )  // one bit per started worker, cleared by waitForResponse()
+    {
+        waitForResponse();
+    }
+}
+
+
+void btThreadSupportPosix::startThreads( const ConstructionInfo& threadConstructionInfo )  // creates the main semaphore and the workers
+{
+    // main thread exists already. Cap at BT_MAX_THREAD_COUNT like the Win32 backend does, and never go below zero
+    // workers in case the reported hardware thread count is 0 or negative.
+    m_numThreads = btMax( 0, btMin( btGetNumHardwareThreads(), int( BT_MAX_THREAD_COUNT ) ) - 1 );
+    printf( "%s creating %i threads.\n", __FUNCTION__, m_numThreads );
+    m_activeThreadStatus.resize( m_numThreads );
+    m_startedThreadsMask = 0;
+    m_mainSemaphore = createSem( "main" );
+
+    for ( int i = 0; i < m_numThreads; i++ )
+    {
+        printf( "starting thread %d\n", i );
+        btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
+        threadStatus.startSemaphore = createSem( "threadLocal" );
+        threadStatus.m_userPtr = 0;
+        threadStatus.m_taskId = i;
+        threadStatus.m_commandId = 0;
+        threadStatus.m_status = 0;
+        threadStatus.m_mainSemaphore = m_mainSemaphore;
+        threadStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
+        threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
+        threadStatus.threadUsed = 0;
+        // launch the thread only once its status record is fully initialized
+        checkPThreadFunction( pthread_create( &threadStatus.thread, NULL, &threadFunction, (void*) &threadStatus ) );
+
+        printf( "started thread %d \n", i );
+    }
+}
+
+///asks every worker thread to exit, joins it, then destroys the semaphores
+void btThreadSupportPosix::stopThreads()
+{
+    for ( size_t t = 0; t < size_t( m_activeThreadStatus.size() ); ++t )
+    {
+        btThreadStatus& threadStatus = m_activeThreadStatus[ t ];
+        printf( "%s: Thread %i used: %lu\n", __FUNCTION__, int( t ), threadStatus.threadUsed );
+
+        // a null user pointer is the exit request (see threadFunction)
+        threadStatus.m_userPtr = 0;
+        checkPThreadFunction( sem_post( threadStatus.startSemaphore ) );
+        checkPThreadFunction( sem_wait( m_mainSemaphore ) );
+        // join first, so nothing is torn down while the thread is still running
+        checkPThreadFunction( pthread_join( threadStatus.thread, 0 ) );
+        printf( "destroy semaphore\n" );
+        destroySem( threadStatus.startSemaphore );
+        printf( "semaphore destroyed\n" );
+    }
+    printf( "destroy main semaphore\n" );
+    destroySem( m_mainSemaphore );
+    printf( "main semaphore destroyed\n" );
+    m_activeThreadStatus.clear();
+}
+
+class btCriticalSectionPosix : public btCriticalSection  // btCriticalSection backed by a pthread mutex
+{
+    pthread_mutex_t m_mutex;
+
+public:
+    btCriticalSectionPosix()
+    {
+        pthread_mutex_init( &m_mutex, NULL );  // NULL attributes: a default mutex; the result is not checked
+    }
+    virtual ~btCriticalSectionPosix()
+    {
+        pthread_mutex_destroy( &m_mutex );
+    }
+
+    virtual void lock()
+    {
+        pthread_mutex_lock( &m_mutex );  // blocks until the mutex is acquired
+    }
+    virtual void unlock()
+    {
+        pthread_mutex_unlock( &m_mutex );
+    }
+};
+
+
+btCriticalSection* btThreadSupportPosix::createCriticalSection()  // allocated with new; release with deleteCriticalSection()
+{
+    return new btCriticalSectionPosix();
+}
+
+void btThreadSupportPosix::deleteCriticalSection( btCriticalSection* cs )  // deletes through the base class's virtual destructor
+{
+    delete cs;
+}
+
+
+btThreadSupportInterface* btThreadSupportInterface::create( const ConstructionInfo& info )  // non-Windows definition of the factory
+{
+    return new btThreadSupportPosix( info );  // the constructor starts the worker threads
+}
+
+#endif // BT_THREADSAFE && !defined( _WIN32 )
+
diff --git a/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
new file mode 100644
index 000000000..f77616337
--- /dev/null
+++ b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
@@ -0,0 +1,480 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2018 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#if defined( _WIN32 ) &&  BT_THREADSAFE
+
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btMinMax.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btThreads.h"
+#include "btThreadSupportInterface.h"
+#include <windows.h>
+#include <stdio.h>
+
+
+struct btProcessorInfo  // processor topology counts filled in by getProcessorInformation()
+{
+    int numLogicalProcessors;
+    int numCores;
+    int numNumaNodes;
+    int numL1Cache;
+    int numL2Cache;
+    int numL3Cache;
+    int numPhysicalPackages;
+    static const int maxNumTeamMasks = 32;  // capacity of processorTeamMasks; further L3 caches are not recorded
+    int numTeamMasks;  // number of valid entries in processorTeamMasks
+    UINT64 processorTeamMasks[ maxNumTeamMasks ];  // one mask per L3 cache: the logical processors sharing it
+};
+
+UINT64 getProcessorTeamMask( const btProcessorInfo& procInfo, int procId )
+{
+    // Returns the first recorded team mask that contains processor procId, or 0 if no team contains it.
+    const UINT64 procBit = UINT64( 1 ) << procId;
+    UINT64 teamMask = 0;
+    for ( int team = 0; team < procInfo.numTeamMasks && teamMask == 0; ++team )
+    {
+        const UINT64 candidate = procInfo.processorTeamMasks[ team ];
+        teamMask = ( candidate & procBit ) ? candidate : 0;
+    }
+    return teamMask;
+}
+
+int getProcessorTeamIndex( const btProcessorInfo& procInfo, int procId )
+{
+    // Returns the index of the first recorded team mask that contains processor procId, or -1 if none does.
+    const UINT64 procBit = UINT64( 1 ) << procId;
+    int team = 0;
+    // advance past the teams whose mask lacks the processor's bit
+    while ( team < procInfo.numTeamMasks && !( procInfo.processorTeamMasks[ team ] & procBit ) )
+    {
+        ++team;
+    }
+    return ( team < procInfo.numTeamMasks ) ? team : -1;
+}
+
+int countSetBits( ULONG64 bits )
+{
+    // Returns how many bits of the argument are 1.
+    int numSet = 0;
+
+    for ( ; bits != 0; bits >>= 1 )
+    {
+        // add the lowest bit, then shift the next one into place
+        numSet += int( bits & 1 );
+    }
+
+    return numSet;
+}
+
+
+typedef BOOL( WINAPI *Pfn_GetLogicalProcessorInformation )( PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD );
+
+
+void getProcessorInformation( btProcessorInfo* procInfo )  // fills *procInfo; it is left zeroed when no information can be obtained
+{
+    memset( procInfo, 0, sizeof( *procInfo ) );
+    Pfn_GetLogicalProcessorInformation getLogicalProcInfo =
+        (Pfn_GetLogicalProcessorInformation) GetProcAddress( GetModuleHandle( TEXT( "kernel32" ) ), "GetLogicalProcessorInformation" );
+    if ( getLogicalProcInfo == NULL )
+    {
+        // no info
+        return;
+    }
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buf = NULL;
+    DWORD bufSize = 0;
+    // keep asking until the buffer is large enough
+    while ( !getLogicalProcInfo( buf, &bufSize ) )
+    {
+        // only "buffer too small" is recoverable; looping on any other error would never terminate
+        const bool needBiggerBuffer = ( GetLastError() == ERROR_INSUFFICIENT_BUFFER );
+        free( buf );  // free( NULL ) is a no-op
+        buf = NULL;
+        if ( !needBiggerBuffer )
+        {
+            return;  // no info
+        }
+        // retry with a buffer of the size the failed call reported in bufSize
+        buf = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc( bufSize );
+        if ( buf == NULL )
+        {
+            return;  // out of memory: no info
+        }
+    }
+
+    int len = bufSize / sizeof( *buf );
+    for ( int i = 0; i < len; ++i )
+    {
+        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION info = buf + i;
+        switch ( info->Relationship )
+        {
+        case RelationNumaNode:
+            procInfo->numNumaNodes++;
+            break;
+
+        case RelationProcessorCore:
+            procInfo->numCores++;
+            procInfo->numLogicalProcessors += countSetBits( info->ProcessorMask );
+            break;
+
+        case RelationCache:
+            if ( info->Cache.Level == 1 )
+            {
+                procInfo->numL1Cache++;
+            }
+            else if ( info->Cache.Level == 2 )
+            {
+                procInfo->numL2Cache++;
+            }
+            else if ( info->Cache.Level == 3 )
+            {
+                procInfo->numL3Cache++;
+                // processors that share L3 cache are considered to be on the same team
+                // because they can more easily work together on the same data.
+                // Large performance penalties will occur if 2 or more threads from different
+                // teams attempt to frequently read and modify the same cache lines.
+                //
+                // On the AMD Ryzen 7 CPU for example, the 8 cores on the CPU are split into
+                // 2 CCX units of 4 cores each. Each CCX has a separate L3 cache, so if both
+                // CCXs are operating on the same data, many cycles will be spent keeping the
+                // two caches coherent.
+                if ( procInfo->numTeamMasks < btProcessorInfo::maxNumTeamMasks )
+                {
+                    procInfo->processorTeamMasks[ procInfo->numTeamMasks ] = info->ProcessorMask;
+                    procInfo->numTeamMasks++;
+                }
+            }
+            break;
+
+        case RelationProcessorPackage:
+            procInfo->numPhysicalPackages++;
+            break;
+        }
+    }
+    free( buf );
+}
+
+
+
+///btThreadSupportWin32 implements btThreadSupportInterface with Win32 threads and event objects
+class btThreadSupportWin32 : public btThreadSupportInterface
+{
+public:
+    struct btThreadStatus
+    {
+        int m_taskId;
+        int m_commandId;
+        int m_status;  // 1 = task started (runTask), 2 = task finished, 3 = thread exiting, 0 = collected by waitForResponse
+
+        ThreadFunc m_userThreadFunc;
+        void* m_userPtr; //for taskDesc etc; a null pointer tells the thread to exit
+        void* m_lsMemory; //per-thread memory handed to m_userThreadFunc
+
+        void* m_threadHandle; //this one is calling 'win32threadStartFunc'
+
+        void* m_eventStartHandle;  // set by runTask to start a task
+        char m_eventStartHandleName[ 32 ];
+
+        void* m_eventCompleteHandle;  // set by the thread when a task is finished or the thread exits
+        char m_eventCompleteHandleName[ 32 ];
+    };
+
+private:
+    btAlignedObjectArray<btThreadStatus> m_activeThreadStatus;
+    btAlignedObjectArray<void*> m_completeHandles;  // handles passed to WaitForMultipleObjects in waitForResponse
+    int m_numThreads;  // number of worker threads (the main thread is not included)
+    DWORD_PTR m_startedThreadMask;  // bit i is set by runTask for worker i and cleared by waitForResponse
+    btProcessorInfo m_processorInfo;
+
+    void startThreads( const ConstructionInfo& threadInfo );
+    void stopThreads();
+    int waitForResponse();
+
+public:
+
+    btThreadSupportWin32( const ConstructionInfo& threadConstructionInfo );
+    virtual ~btThreadSupportWin32();
+
+    virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
+    virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); }  // NOTE(review): 0 if no L3 team mask was recorded
+
+    virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE;
+    virtual void waitForAllTasks() BT_OVERRIDE;
+
+    virtual void* getThreadLocalMemory( int taskId ) BT_OVERRIDE
+    {
+        return m_activeThreadStatus[ taskId ].m_lsMemory;  // taskId is a worker index; no range check here
+    }
+
+    virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
+    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE;
+};
+
+
+btThreadSupportWin32::btThreadSupportWin32( const ConstructionInfo & threadConstructionInfo )
+{
+    startThreads( threadConstructionInfo );  // all set-up is delegated to startThreads
+}
+
+
+btThreadSupportWin32::~btThreadSupportWin32()
+{
+    stopThreads();  // all tear-down is delegated to stopThreads
+}
+
+
+DWORD WINAPI win32threadStartFunc( LPVOID lpParam )  // worker entry point; lpParam is the thread's btThreadStatus record
+{
+    btThreadSupportWin32::btThreadStatus* status = ( btThreadSupportWin32::btThreadStatus* )lpParam;
+
+    while ( 1 )
+    {
+        WaitForSingleObject( status->m_eventStartHandle, INFINITE );  // sleep until runTask (or shutdown) sets the start event
+        void* userPtr = status->m_userPtr;
+
+        if ( userPtr )
+        {
+            btAssert( status->m_status );
+            status->m_userThreadFunc( userPtr, status->m_lsMemory );
+            status->m_status = 2;  // 2 = task finished
+            SetEvent( status->m_eventCompleteHandle );
+        }
+        else
+        {
+            //exit Thread (a null user pointer is the exit request)
+            status->m_status = 3;
+            printf( "Thread with taskId %i with handle %p exiting\n", status->m_taskId, status->m_threadHandle );
+            SetEvent( status->m_eventCompleteHandle );
+            break;
+        }
+    }
+    printf( "Thread TERMINATED\n" );
+    return 0;
+}
+
+
+void btThreadSupportWin32::runTask( int threadIndex, void* userData )  // hands userData to worker threadIndex and sets its start event
+{
+    btAssert( threadIndex >= 0 );  // these asserts used to name an undeclared `taskId`, breaking builds with asserts enabled
+    btAssert( threadIndex < m_activeThreadStatus.size() );
+    btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ];  // looked up only after the range check
+
+    threadStatus.m_commandId = 1;
+    threadStatus.m_status = 1;
+    threadStatus.m_userPtr = userData;
+    m_startedThreadMask |= DWORD_PTR( 1 ) << threadIndex;
+
+    ///fire event to start new task
+    SetEvent( threadStatus.m_eventStartHandle );
+}
+
+
+int btThreadSupportWin32::waitForResponse()  // waits for one completion event and returns the index of its worker
+{
+    btAssert( m_activeThreadStatus.size() );
+
+    DWORD res = WaitForMultipleObjects( m_completeHandles.size(), &m_completeHandles[ 0 ], FALSE, INFINITE );
+    btAssert( res != WAIT_FAILED );
+    const int last = int( res - WAIT_OBJECT_0 );
+    // the index comes from the wait result: range-check it before it is used to index the status array
+    btAssert( last >= 0 );
+    btAssert( last < m_activeThreadStatus.size() );
+
+    btThreadStatus& threadStatus = m_activeThreadStatus[ last ];
+    btAssert( threadStatus.m_threadHandle );
+    btAssert( threadStatus.m_eventCompleteHandle );
+
+    btAssert( threadStatus.m_status > 1 );
+    threadStatus.m_status = 0;
+
+    // this worker no longer has a task outstanding
+    m_startedThreadMask &= ~( DWORD_PTR( 1 ) << last );
+
+    return last;
+}
+
+
+void btThreadSupportWin32::waitForAllTasks()
+{
+    while ( m_startedThreadMask )  // one bit per thread started by runTask() whose completion has not been collected
+    {
+        waitForResponse();  // clears the bit of whichever thread finished
+    }
+}
+
+
+void btThreadSupportWin32::startThreads( const ConstructionInfo& threadConstructionInfo )  // creates the worker threads, their events, and sets thread affinities
+{
+    static int uniqueId = 0;  // bumped on every call and embedded in the event names below
+    uniqueId++;
+    btProcessorInfo& procInfo = m_processorInfo;
+    getProcessorInformation( &procInfo );
+    DWORD_PTR dwProcessAffinityMask = 0;
+    DWORD_PTR dwSystemAffinityMask = 0;
+    if ( !GetProcessAffinityMask( GetCurrentProcess(), &dwProcessAffinityMask, &dwSystemAffinityMask ) )
+    {
+        dwProcessAffinityMask = 0;  // a zero mask makes every "mask &" test below fail, so no affinity is set
+    }
+    ///The number of threads should be equal to the number of available cores - 1
+    m_numThreads = btMin(procInfo.numLogicalProcessors, int(BT_MAX_THREAD_COUNT)) - 1; // cap to max thread count (-1 because main thread already exists)
+
+    m_activeThreadStatus.resize( m_numThreads );
+    m_completeHandles.resize( m_numThreads );
+    m_startedThreadMask = 0;
+
+    // set main thread affinity
+    if ( DWORD_PTR mask = dwProcessAffinityMask & getProcessorTeamMask( procInfo, 0 ))
+    {
+        SetThreadAffinityMask( GetCurrentThread(), mask );
+        SetThreadIdealProcessor( GetCurrentThread(), 0 );
+    }
+
+    for ( int i = 0; i < m_numThreads; i++ )
+    {
+        printf( "starting thread %d\n", i );
+
+        btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
+
+        LPSECURITY_ATTRIBUTES lpThreadAttributes = NULL;
+        SIZE_T dwStackSize = threadConstructionInfo.m_threadStackSize;
+        LPTHREAD_START_ROUTINE lpStartAddress = &win32threadStartFunc;
+        LPVOID lpParameter = &threadStatus;  // each worker receives a pointer to its own status slot
+        DWORD dwCreationFlags = 0;
+        LPDWORD lpThreadId = 0;  // the thread id is not requested
+
+        threadStatus.m_userPtr = 0;
+
+        sprintf( threadStatus.m_eventStartHandleName, "es%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i );
+        threadStatus.m_eventStartHandle = CreateEventA( 0, false, false, threadStatus.m_eventStartHandleName );  // auto-reset event, initially non-signaled
+
+        sprintf( threadStatus.m_eventCompleteHandleName, "ec%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i );
+        threadStatus.m_eventCompleteHandle = CreateEventA( 0, false, false, threadStatus.m_eventCompleteHandleName );  // auto-reset event, initially non-signaled
+
+        m_completeHandles[ i ] = threadStatus.m_eventCompleteHandle;  // same index as the thread, which waitForResponse() relies on
+
+        HANDLE handle = CreateThread( lpThreadAttributes, dwStackSize, lpStartAddress, lpParameter, dwCreationFlags, lpThreadId );
+        //SetThreadPriority( handle, THREAD_PRIORITY_HIGHEST );
+        // highest priority -- can cause erratic performance when numThreads > numCores
+        //                     we don't want worker threads to be higher priority than the main thread or the main thread could get
+        //                     totally shut out and unable to tell the workers to stop
+        //SetThreadPriority( handle, THREAD_PRIORITY_BELOW_NORMAL );
+
+        {
+            int processorId = i + 1;  // leave processor 0 for main thread
+            DWORD_PTR teamMask = getProcessorTeamMask( procInfo, processorId );
+            if ( teamMask )
+            {
+                // bind each thread to only execute on processors of its assigned team
+                //  - for single-socket Intel x86 CPUs this has no effect (only a single, shared L3 cache so there is only 1 team)
+                //  - for multi-socket Intel this will keep threads from migrating from one socket to another
+                //  - for AMD Ryzen this will keep threads from migrating from one CCX to another
+                DWORD_PTR mask = teamMask & dwProcessAffinityMask;  // restrict the team to processors this process may use
+                if ( mask )
+                {
+                    SetThreadAffinityMask( handle, mask );
+                }
+            }
+            SetThreadIdealProcessor( handle, processorId );
+        }
+
+        threadStatus.m_taskId = i;
+        threadStatus.m_commandId = 0;
+        threadStatus.m_status = 0;  // 0 = no task outstanding
+        threadStatus.m_threadHandle = handle;
+        threadStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();  // released with delete in stopThreads()
+        threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
+
+        printf( "started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle );
+    }
+}
+
+///signal every worker thread to exit, wait for it, and release its events and thread handle
+void btThreadSupportWin32::stopThreads()
+{
+    for ( int i = 0; i < m_activeThreadStatus.size(); i++ )
+    {
+        btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
+        if ( threadStatus.m_status > 0 )  // a task was started and its completion has not been collected yet
+        {
+            WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE );
+        }
+
+        delete threadStatus.m_lsMemory;  // NOTE(review): assumes m_lsMemoryFunc() allocates with a matching new -- confirm
+
+        threadStatus.m_userPtr = 0;  // a null user pointer makes the worker loop take its exit branch
+        SetEvent( threadStatus.m_eventStartHandle );
+        WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE );  // the worker signals this right before leaving its loop
+
+        CloseHandle( threadStatus.m_eventCompleteHandle );
+        CloseHandle( threadStatus.m_eventStartHandle );
+        CloseHandle( threadStatus.m_threadHandle );
+
+    }
+
+    m_activeThreadStatus.clear();
+    m_completeHandles.clear();
+}
+
+
+class btWin32CriticalSection : public btCriticalSection  // btCriticalSection implemented with a Win32 CRITICAL_SECTION
+{
+private:
+    CRITICAL_SECTION mCriticalSection;
+
+public:
+    btWin32CriticalSection()
+    {
+        InitializeCriticalSection( &mCriticalSection );
+    }
+
+    ~btWin32CriticalSection()
+    {
+        DeleteCriticalSection( &mCriticalSection );
+    }
+
+    void lock()
+    {
+        EnterCriticalSection( &mCriticalSection );  // blocks until the calling thread owns the critical section
+    }
+
+    void unlock()
+    {
+        LeaveCriticalSection( &mCriticalSection );
+    }
+};
+
+
+btCriticalSection* btThreadSupportWin32::createCriticalSection()
+{
+    unsigned char* mem = (unsigned char*) btAlignedAlloc( sizeof( btWin32CriticalSection ), 16 );  // 16-byte aligned raw storage
+    btWin32CriticalSection* cs = new( mem ) btWin32CriticalSection();  // placement new; release through deleteCriticalSection()
+    return cs;
+}
+
+void btThreadSupportWin32::deleteCriticalSection( btCriticalSection* criticalSection )
+{
+    criticalSection->~btCriticalSection();  // explicit destructor call; NOTE(review): relies on btCriticalSection's destructor being virtual -- confirm
+    btAlignedFree( criticalSection );  // returns the storage obtained with btAlignedAlloc in createCriticalSection()
+}
+
+
+btThreadSupportInterface* btThreadSupportInterface::create( const ConstructionInfo& info )
+{
+    return new btThreadSupportWin32( info );  // Win32 implementation, allocated with new; NOTE(review): presumably the caller deletes it -- confirm
+}
+
+
+
+#endif //defined(_WIN32) && BT_THREADSAFE
+
diff --git a/src/LinearMath/btThreads.cpp b/src/LinearMath/btThreads.cpp
index 59a7ea36e..c037626ff 100644
--- a/src/LinearMath/btThreads.cpp
+++ b/src/LinearMath/btThreads.cpp
@@ -453,6 +453,33 @@ void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBod
 #endif// #if BT_THREADSAFE
 }
 
+btScalar btParallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body )  // dispatches body over [iBegin, iEnd) through the active task scheduler and returns the sum
+{
+#if BT_THREADSAFE
+
+#if BT_DETECT_BAD_THREAD_INDEX
+    if ( !btThreadsAreRunning() )
+    {
+        // clear out thread ids
+        for ( int i = 0; i < BT_MAX_THREAD_COUNT; ++i )
+        {
+            gDebugThreadIds[ i ] = kInvalidThreadId;
+        }
+    }
+#endif // #if BT_DETECT_BAD_THREAD_INDEX
+
+    btAssert( gBtTaskScheduler != NULL );  // call btSetTaskScheduler() with a valid task scheduler first!
+    return gBtTaskScheduler->parallelSum( iBegin, iEnd, grainSize, body );
+
+#else // #if BT_THREADSAFE
+
+    // non-parallel version of btParallelSum
+    btAssert( !"called btParallelSum in non-threadsafe build. enable BT_THREADSAFE" );
+    return body.sumLoop( iBegin, iEnd );  // if the assert is compiled out, sum the whole range on the calling thread
+
+#endif //#else // #if BT_THREADSAFE
+}
+
 
 ///
 /// btTaskSchedulerSequential -- non-threaded implementation of task scheduler
@@ -470,6 +497,11 @@ public:
         BT_PROFILE( "parallelFor_sequential" );
         body.forLoop( iBegin, iEnd );
     }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_sequential" );
+        return body.sumLoop( iBegin, iEnd );  // whole range in a single call on the calling thread; grainSize is not used
+    }
 };
 
 
@@ -514,11 +546,25 @@ public:
 #pragma omp parallel for schedule( static, 1 )
         for ( int i = iBegin; i < iEnd; i += grainSize )
         {
-            BT_PROFILE( "OpenMP_job" );
+            BT_PROFILE( "OpenMP_forJob" );
             body.forLoop( i, ( std::min )( i + grainSize, iEnd ) );
         }
         btPopThreadsAreRunning();
     }
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_OpenMP" );
+        btPushThreadsAreRunning();
+        btScalar sum = btScalar( 0 );  // each OpenMP thread accumulates a private copy, combined by the reduction clause below
+#pragma omp parallel for schedule( static, 1 ) reduction(+:sum)
+        for ( int i = iBegin; i < iEnd; i += grainSize )
+        {
+            BT_PROFILE( "OpenMP_sumJob" );
+            sum += body.sumLoop( i, ( std::min )( i + grainSize, iEnd ) );  // one chunk of at most grainSize elements, clamped to iEnd
+        }
+        btPopThreadsAreRunning();
+        return sum;
+    }
 };
 #endif // #if BT_USE_OPENMP && BT_THREADSAFE
 
@@ -571,22 +617,21 @@ public:
             btResetThreadIndexCounter();
         }
     }
-    struct BodyAdapter
+    struct ForBodyAdapter
     {
         const btIParallelForBody* mBody;
 
+        ForBodyAdapter( const btIParallelForBody* body ) : mBody( body ) {}
         void operator()( const tbb::blocked_range<int>& range ) const
         {
-            BT_PROFILE( "TBB_job" );
+            BT_PROFILE( "TBB_forJob" );
             mBody->forLoop( range.begin(), range.end() );
         }
     };
     virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
     {
         BT_PROFILE( "parallelFor_TBB" );
-        // TBB dispatch
-        BodyAdapter tbbBody;
-        tbbBody.mBody = &body;
+        ForBodyAdapter tbbBody( &body );
         btPushThreadsAreRunning();
         tbb::parallel_for( tbb::blocked_range<int>( iBegin, iEnd, grainSize ),
             tbbBody,
@@ -594,6 +639,29 @@ public:
         );
         btPopThreadsAreRunning();
     }
+    struct SumBodyAdapter
+    {
+        const btIParallelSumBody* mBody;
+        btScalar mSum;  // partial sum accumulated by this adapter instance
+
+        SumBodyAdapter( const btIParallelSumBody* body ) : mBody( body ), mSum( btScalar( 0 ) ) {}
+        SumBodyAdapter( const SumBodyAdapter& src, tbb::split ) : mBody( src.mBody ), mSum( btScalar( 0 ) ) {}  // splitting constructor: shares the body, starts a fresh partial sum at zero
+        void join( const SumBodyAdapter& src ) { mSum += src.mSum; }  // folds another instance's partial sum into this one
+        void operator()( const tbb::blocked_range<int>& range )
+        {
+            BT_PROFILE( "TBB_sumJob" );
+            mSum += mBody->sumLoop( range.begin(), range.end() );
+        }
+    };
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_TBB" );
+        SumBodyAdapter tbbBody( &body );
+        btPushThreadsAreRunning();
+        tbb::parallel_deterministic_reduce( tbb::blocked_range<int>( iBegin, iEnd, grainSize ), tbbBody );  // accumulates into tbbBody.mSum through operator() and join()
+        btPopThreadsAreRunning();
+        return tbbBody.mSum;
+    }
 };
 #endif // #if BT_USE_TBB && BT_THREADSAFE
 
@@ -605,6 +673,7 @@ public:
 class btTaskSchedulerPPL : public btITaskScheduler
 {
     int m_numThreads;
+    concurrency::combinable<btScalar> m_sum;  // for parallelSum
 public:
     btTaskSchedulerPPL() : btITaskScheduler( "PPL" )
     {
@@ -644,15 +713,16 @@ public:
             btResetThreadIndexCounter();
         }
     }
-    struct BodyAdapter
+    struct ForBodyAdapter
     {
         const btIParallelForBody* mBody;
         int mGrainSize;
         int mIndexEnd;
 
+        ForBodyAdapter( const btIParallelForBody* body, int grainSize, int end ) : mBody( body ), mGrainSize( grainSize ), mIndexEnd( end ) {}
         void operator()( int i ) const
         {
-            BT_PROFILE( "PPL_job" );
+            BT_PROFILE( "PPL_forJob" );
             mBody->forLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );
         }
     };
@@ -660,10 +730,7 @@ public:
     {
         BT_PROFILE( "parallelFor_PPL" );
         // PPL dispatch
-        BodyAdapter pplBody;
-        pplBody.mBody = &body;
-        pplBody.mGrainSize = grainSize;
-        pplBody.mIndexEnd = iEnd;
+        ForBodyAdapter pplBody( &body, grainSize, iEnd );
         btPushThreadsAreRunning();
         // note: MSVC 2010 doesn't support partitioner args, so avoid them
         concurrency::parallel_for( iBegin,
@@ -673,6 +740,36 @@ public:
         );
         btPopThreadsAreRunning();
     }
+    struct SumBodyAdapter
+    {
+        const btIParallelSumBody* mBody;
+        concurrency::combinable<btScalar>* mSum;  // not owned; parallelSum() passes the scheduler's m_sum
+        int mGrainSize;
+        int mIndexEnd;
+
+        SumBodyAdapter( const btIParallelSumBody* body, concurrency::combinable<btScalar>* sum, int grainSize, int end ) : mBody( body ), mSum(sum), mGrainSize( grainSize ), mIndexEnd( end ) {}
+        void operator()( int i ) const  // i is the first index of a chunk
+        {
+            BT_PROFILE( "PPL_sumJob" );
+            mSum->local() += mBody->sumLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );  // adds the chunk's sum to the calling thread's local accumulator
+        }
+    };
+    static btScalar sumFunc( btScalar a, btScalar b ) { return a + b; }  // binary combiner handed to m_sum.combine() in parallelSum()
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_PPL" );
+        m_sum.clear();  // discard the thread-local partial sums left over from a previous call
+        SumBodyAdapter pplBody( &body, &m_sum, grainSize, iEnd );
+        btPushThreadsAreRunning();
+        // note: MSVC 2010 doesn't support partitioner args, so avoid them
+        concurrency::parallel_for( iBegin,
+            iEnd,
+            grainSize,
+            pplBody
+        );
+        btPopThreadsAreRunning();
+        return m_sum.combine( sumFunc );  // reduce the per-thread partial sums with sumFunc
+    }
 };
 #endif // #if BT_USE_PPL && BT_THREADSAFE
 
diff --git a/src/LinearMath/btThreads.h b/src/LinearMath/btThreads.h
index cef542329..ecd5a19cf 100644
--- a/src/LinearMath/btThreads.h
+++ b/src/LinearMath/btThreads.h
@@ -107,6 +107,17 @@ public:
     virtual void forLoop( int iBegin, int iEnd ) const = 0;
 };
 
+//
+// btIParallelSumBody -- subclass this to express work that can be done in parallel
+//                       and produces a sum over all loop elements
+//
+class btIParallelSumBody
+{
+public:
+    virtual ~btIParallelSumBody() {}
+    virtual btScalar sumLoop( int iBegin, int iEnd ) const = 0;  // schedulers call this per chunk and add up the returned values; iEnd is exclusive
+};
+
 //
 // btITaskScheduler -- subclass this to implement a task scheduler that can dispatch work to
 //                     worker threads
@@ -122,6 +133,8 @@ public:
     virtual int getNumThreads() const = 0;
     virtual void setNumThreads( int numThreads ) = 0;
     virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) = 0;
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) = 0;
+    virtual void sleepWorkerThreadsHint() {}  // hint the task scheduler that we may not be using these threads for a little while
 
     // internal use only
     virtual void activate();
@@ -143,6 +156,9 @@ btITaskScheduler* btGetTaskScheduler();
 // get non-threaded task scheduler (always available)
 btITaskScheduler* btGetSequentialTaskScheduler();
 
+// create a default task scheduler (Win32 or pthreads based)
+btITaskScheduler* btCreateDefaultTaskScheduler();
+
 // get OpenMP task scheduler (if available, otherwise returns null)
 btITaskScheduler* btGetOpenMPTaskScheduler();
 
@@ -156,5 +172,9 @@ btITaskScheduler* btGetPPLTaskScheduler();
 //                 (iterations may be done out of order, so no dependencies are allowed)
 void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body );
 
+// btParallelSum -- call this to dispatch work like a for-loop, returns the sum of all iterations
+//                 (iterations may be done out of order, so no dependencies are allowed)
+btScalar btParallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body );
+
 
 #endif

From 06690e542b22b08c2c120d110ba8225211afb5ce Mon Sep 17 00:00:00 2001
From: Lunkhound <lunkhound@gmail.com>
Date: Tue, 27 Feb 2018 00:30:45 -0800
Subject: [PATCH 2/8] fix compile error for GCC

---
 src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
index bc840e889..a27350bf9 100644
--- a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
@@ -1113,7 +1113,7 @@ void btBatchedConstraints::setup(
 {
     if (constraints->size() >= minBatchSize*4)
     {
-        bool use2DGrid = batchingMethod == BatchingMethod::BATCHING_METHOD_SPATIAL_GRID_2D;
+        bool use2DGrid = batchingMethod == BATCHING_METHOD_SPATIAL_GRID_2D;
         setupSpatialGridBatchesMt( this, scratchMemory, constraints, bodies, minBatchSize, maxBatchSize, use2DGrid );
         if (s_debugDrawBatches)
         {

From d900a749392465e6bbadfdaa98635abcf30b30f4 Mon Sep 17 00:00:00 2001
From: Lunkhound <lunkhound@gmail.com>
Date: Tue, 27 Feb 2018 03:03:12 -0800
Subject: [PATCH 3/8] add new source files to setup.py to fix travis-ci build

---
 setup.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 664059460..bdf38bf27 100644
--- a/setup.py
+++ b/setup.py
@@ -134,6 +134,9 @@ sources = ["examples/pybullet/pybullet.c"]\
 +["src/LinearMath/btConvexHullComputer.cpp"]\
 +["src/LinearMath/btQuickprof.cpp"]\
 +["src/LinearMath/btThreads.cpp"]\
++["src/LinearMath/TaskScheduler/btTaskScheduler.cpp"]\
++["src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp"]\
++["src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp"]\
 +["src/BulletCollision/BroadphaseCollision/btAxisSweep3.cpp"]\
 +["src/BulletCollision/BroadphaseCollision/btDbvt.cpp"]\
 +["src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp"]\
@@ -233,6 +236,7 @@ sources = ["examples/pybullet/pybullet.c"]\
 +["src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp"]\
 +["src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp"]\
 +["src/BulletDynamics/Dynamics/btSimpleDynamicsWorld.cpp"]\
++["src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp"]\
 +["src/BulletDynamics/ConstraintSolver/btConeTwistConstraint.cpp"]\
 +["src/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.cpp"]\
 +["src/BulletDynamics/ConstraintSolver/btSliderConstraint.cpp"]\
@@ -249,6 +253,7 @@ sources = ["examples/pybullet/pybullet.c"]\
 +["src/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.cpp"]\
 +["src/BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.cpp"]\
 +["src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp"]\
++["src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp"]\
 +["src/BulletDynamics/MLCPSolvers/btDantzigLCP.cpp"]\
 +["src/BulletDynamics/MLCPSolvers/btLemkeAlgorithm.cpp"]\
 +["src/BulletDynamics/MLCPSolvers/btMLCPSolver.cpp"]\
@@ -479,4 +484,3 @@ setup(
     packages=[x for x in find_packages('examples/pybullet/gym')],
     package_data = { 'pybullet_data': need_files }
 )
-

From 45fd4acf6eb69389f9a600903a7b7bf8e813815f Mon Sep 17 00:00:00 2001
From: Lunkhound <lunkhound@gmail.com>
Date: Mon, 5 Mar 2018 20:05:38 -0800
Subject: [PATCH 4/8] dynamicsWorldMt: make island mgr aware of whether a
 parallel solver is present and make handoff from parallel solver to
 solver-pool more explicit

---
 .../CommonRigidBodyMTBase.cpp                 |  28 ++++-
 .../Dynamics/btDiscreteDynamicsWorldMt.cpp    |  88 +++-----------
 .../Dynamics/btDiscreteDynamicsWorldMt.h      |   4 +-
 .../Dynamics/btSimulationIslandManagerMt.cpp  | 110 ++++++++++--------
 .../Dynamics/btSimulationIslandManagerMt.h    |  36 +++---
 5 files changed, 116 insertions(+), 150 deletions(-)

diff --git a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
index b11cd7691..f9e0c209a 100644
--- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
+++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
@@ -200,11 +200,11 @@ public:
 ///
 /// myParallelIslandDispatch -- wrap default parallel dispatch for profiling and to get the number of simulation islands
 //
-void myParallelIslandDispatch( btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr, btSimulationIslandManagerMt::IslandCallback* callback )
+void myParallelIslandDispatch( btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr, const btSimulationIslandManagerMt::SolverParams& solverParams)
 {
     ProfileHelper prof( Profiler::kRecordDispatchIslands );
     gNumIslands = islandsPtr->size();
-    btSimulationIslandManagerMt::parallelIslandDispatch( islandsPtr, callback );
+    btSimulationIslandManagerMt::parallelIslandDispatch( islandsPtr, solverParams );
 }
 
 
@@ -239,9 +239,10 @@ public:
     MyDiscreteDynamicsWorld( btDispatcher* dispatcher,
                              btBroadphaseInterface* pairCache,
                              btConstraintSolverPoolMt* constraintSolver,
+                             btSequentialImpulseConstraintSolverMt* constraintSolverMt,
                              btCollisionConfiguration* collisionConfiguration
                              ) :
-                             btDiscreteDynamicsWorldMt( dispatcher, pairCache, constraintSolver, collisionConfiguration )
+                             btDiscreteDynamicsWorldMt( dispatcher, pairCache, constraintSolver, constraintSolverMt, collisionConfiguration )
     {
         btSimulationIslandManagerMt* islandMgr = static_cast<btSimulationIslandManagerMt*>( m_islandManager );
         islandMgr->setIslandDispatchFunction( myParallelIslandDispatch );
@@ -347,11 +348,12 @@ static btTaskSchedulerManager gTaskSchedulerMgr;
 #if BT_THREADSAFE
 static bool gMultithreadedWorld = true;
 static bool gDisplayProfileInfo = true;
+static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT;
 #else
 static bool gMultithreadedWorld = false;
 static bool gDisplayProfileInfo = false;
+static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
 #endif
-static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT;
 static int gSolverMode = SOLVER_SIMD |
                         SOLVER_USE_WARMSTARTING |
                         // SOLVER_RANDMIZE_ORDER |
@@ -547,16 +549,28 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
 
         btConstraintSolverPoolMt* solverPool;
         {
+            SolverType poolSolverType = m_solverType;
+            if (poolSolverType == SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT)
+            {
+                // pool solvers shouldn't be parallel solvers, we don't allow that kind of
+                // nested parallelism because of performance issues
+                poolSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
+            }
             btConstraintSolver* solvers[ BT_MAX_THREAD_COUNT ];
             int maxThreadCount = BT_MAX_THREAD_COUNT;
             for ( int i = 0; i < maxThreadCount; ++i )
             {
-                solvers[ i ] = createSolverByType( m_solverType );
+                solvers[ i ] = createSolverByType( poolSolverType );
             }
             solverPool = new btConstraintSolverPoolMt( solvers, maxThreadCount );
             m_solver = solverPool;
         }
-        btDiscreteDynamicsWorld* world = new MyDiscreteDynamicsWorld( m_dispatcher, m_broadphase, solverPool, m_collisionConfiguration );
+        btSequentialImpulseConstraintSolverMt* solverMt = NULL;
+        if ( m_solverType == SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT )
+        {
+            solverMt = new MySequentialImpulseConstraintSolverMt();
+        }
+        btDiscreteDynamicsWorld* world = new MyDiscreteDynamicsWorld( m_dispatcher, m_broadphase, solverPool, solverMt, m_collisionConfiguration );
         m_dynamicsWorld = world;
         m_multithreadedWorld = true;
         btAssert( btGetTaskScheduler() != NULL );
@@ -579,6 +593,8 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
         SolverType solverType = m_solverType;
         if ( solverType == SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT )
         {
+            // using the parallel solver with the single-threaded world works, but is
+            // disabled here to avoid confusion
             solverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
         }
         m_solver = createSolverByType( solverType );
diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
index 330bccb87..d705bf238 100644
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
@@ -50,63 +50,6 @@ subject to the following restrictions:
 #include "LinearMath/btSerializer.h"
 
 
-struct InplaceSolverIslandCallbackMt : public btSimulationIslandManagerMt::IslandCallback
-{
-	btContactSolverInfo*	m_solverInfo;
-	btConstraintSolver*		m_solver;
-	btIDebugDraw*			m_debugDrawer;
-	btDispatcher*			m_dispatcher;
-
-	InplaceSolverIslandCallbackMt(
-		btConstraintSolver*	solver,
-		btStackAlloc* stackAlloc,
-		btDispatcher* dispatcher)
-		:m_solverInfo(NULL),
-		m_solver(solver),
-		m_debugDrawer(NULL),
-		m_dispatcher(dispatcher)
-	{
-
-	}
-
-	InplaceSolverIslandCallbackMt& operator=(InplaceSolverIslandCallbackMt& other)
-	{
-		btAssert(0);
-		(void)other;
-		return *this;
-	}
-
-	SIMD_FORCE_INLINE void setup ( btContactSolverInfo* solverInfo, btIDebugDraw* debugDrawer)
-	{
-		btAssert(solverInfo);
-		m_solverInfo = solverInfo;
-		m_debugDrawer = debugDrawer;
-	}
-
-
-	virtual	void	processIsland( btCollisionObject** bodies,
-                                   int numBodies,
-                                   btPersistentManifold** manifolds,
-                                   int numManifolds,
-                                   btTypedConstraint** constraints,
-                                   int numConstraints,
-                                   int islandId
-                                   )
-	{
-        m_solver->solveGroup( bodies,
-                              numBodies,
-                              manifolds,
-                              numManifolds,
-                              constraints,
-                              numConstraints,
-                              *m_solverInfo,
-                              m_debugDrawer,
-                              m_dispatcher
-                              );
-    }
-
-};
-
 
 ///
 /// btConstraintSolverPoolMt
@@ -209,7 +152,12 @@ void btConstraintSolverPoolMt::reset()
 /// btDiscreteDynamicsWorldMt
 ///
 
-btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher, btBroadphaseInterface* pairCache, btConstraintSolverPoolMt* constraintSolver, btCollisionConfiguration* collisionConfiguration)
+btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,
+    btBroadphaseInterface* pairCache,
+    btConstraintSolverPoolMt* constraintSolver,
+    btConstraintSolver* constraintSolverMt,
+    btCollisionConfiguration* collisionConfiguration
+)
 : btDiscreteDynamicsWorld(dispatcher,pairCache,constraintSolver,collisionConfiguration)
 {
 	if (m_ownsIslandManager)
@@ -217,31 +165,18 @@ btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher, b
 		m_islandManager->~btSimulationIslandManager();
 		btAlignedFree( m_islandManager);
 	}
-    {
-		void* mem = btAlignedAlloc(sizeof(InplaceSolverIslandCallbackMt),16);
-		m_solverIslandCallbackMt = new (mem) InplaceSolverIslandCallbackMt (m_constraintSolver, 0, dispatcher);
-    }
 	{
 		void* mem = btAlignedAlloc(sizeof(btSimulationIslandManagerMt),16);
 		btSimulationIslandManagerMt* im = new (mem) btSimulationIslandManagerMt();
         im->setMinimumSolverBatchSize( m_solverInfo.m_minimumSolverBatchSize );
         m_islandManager = im;
 	}
+    m_constraintSolverMt = constraintSolverMt;
 }
 
 
 btDiscreteDynamicsWorldMt::~btDiscreteDynamicsWorldMt()
 {
-	if (m_solverIslandCallbackMt)
-	{
-		m_solverIslandCallbackMt->~InplaceSolverIslandCallbackMt();
-		btAlignedFree(m_solverIslandCallbackMt);
-	}
-	if (m_ownsConstraintSolver)
-	{
-		m_constraintSolver->~btConstraintSolver();
-		btAlignedFree(m_constraintSolver);
-	}
 }
 
 
@@ -249,12 +184,17 @@ void btDiscreteDynamicsWorldMt::solveConstraints(btContactSolverInfo& solverInfo
 {
 	BT_PROFILE("solveConstraints");
 
-	m_solverIslandCallbackMt->setup(&solverInfo, getDebugDrawer());
 	m_constraintSolver->prepareSolve(getCollisionWorld()->getNumCollisionObjects(), getCollisionWorld()->getDispatcher()->getNumManifolds());
 
 	/// solve all the constraints for this island
     btSimulationIslandManagerMt* im = static_cast<btSimulationIslandManagerMt*>(m_islandManager);
-    im->buildAndProcessIslands( getCollisionWorld()->getDispatcher(), getCollisionWorld(), m_constraints, m_solverIslandCallbackMt );
+    btSimulationIslandManagerMt::SolverParams solverParams;
+    solverParams.m_solverPool = m_constraintSolver;
+    solverParams.m_solverMt = m_constraintSolverMt;
+    solverParams.m_solverInfo = &solverInfo;
+    solverParams.m_debugDrawer = m_debugDrawer;
+    solverParams.m_dispatcher = getCollisionWorld()->getDispatcher();
+    im->buildAndProcessIslands( getCollisionWorld()->getDispatcher(), getCollisionWorld(), m_constraints, solverParams );
 
 	m_constraintSolver->allSolved(solverInfo, m_debugDrawer);
 }
diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
index a8cc22dd0..667fe5800 100644
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
@@ -21,7 +21,6 @@ subject to the following restrictions:
 #include "btSimulationIslandManagerMt.h"
 #include "BulletDynamics/ConstraintSolver/btConstraintSolver.h"
 
-struct InplaceSolverIslandCallbackMt;
 
 ///
 /// btConstraintSolverPoolMt - masquerades as a constraint solver, but really it is a threadsafe pool of them.
@@ -88,7 +87,7 @@ private:
 ATTRIBUTE_ALIGNED16(class) btDiscreteDynamicsWorldMt : public btDiscreteDynamicsWorld
 {
 protected:
-    InplaceSolverIslandCallbackMt* m_solverIslandCallbackMt;
+    btConstraintSolver* m_constraintSolverMt;
 
     virtual void solveConstraints(btContactSolverInfo& solverInfo) BT_OVERRIDE;
 
@@ -126,6 +125,7 @@ public:
 	btDiscreteDynamicsWorldMt(btDispatcher* dispatcher,
         btBroadphaseInterface* pairCache,
         btConstraintSolverPoolMt* constraintSolver,   // Note this should be a solver-pool for multi-threading
+        btConstraintSolver* constraintSolverMt,    // single multi-threaded solver for large islands (or NULL)
         btCollisionConfiguration* collisionConfiguration
     );
 	virtual ~btDiscreteDynamicsWorldMt();
diff --git a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
index 54ac39aaf..fc54f0ba6 100644
--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
@@ -276,7 +276,7 @@ btSimulationIslandManagerMt::Island* btSimulationIslandManagerMt::allocateIsland
 void btSimulationIslandManagerMt::buildIslands( btDispatcher* dispatcher, btCollisionWorld* collisionWorld )
 {
 
-	BT_PROFILE("islandUnionFindAndQuickSort");
+	BT_PROFILE("buildIslands");
 	
 	btCollisionObjectArray& collisionObjects = collisionWorld->getCollisionObjectArray();
 
@@ -545,53 +545,58 @@ void btSimulationIslandManagerMt::mergeIslands()
 }
 
 
-void btSimulationIslandManagerMt::serialIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback )
+void btSimulationIslandManagerMt::solveIsland(btConstraintSolver* solver, Island& island, const SolverParams& solverParams)
+{
+    // Run one island through the given solver, using the shared settings carried in solverParams.
+    // An empty manifold or constraint array is passed as NULL instead of taking the address of element 0.
+    const int manifoldCount = island.manifoldArray.size();
+    const int jointCount = island.constraintArray.size();
+    btPersistentManifold** manifoldList = ( manifoldCount != 0 ) ? &island.manifoldArray[ 0 ] : NULL;
+    btTypedConstraint** jointList = ( jointCount != 0 ) ? &island.constraintArray[ 0 ] : NULL;
+    // bodyArray is indexed without an emptiness check
+    solver->solveGroup( &island.bodyArray[ 0 ], island.bodyArray.size(),
+        manifoldList, manifoldCount,
+        jointList, jointCount,
+        *solverParams.m_solverInfo, solverParams.m_debugDrawer, solverParams.m_dispatcher
+    );
+}
+
+
+void btSimulationIslandManagerMt::serialIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, const SolverParams& solverParams )
 {
     BT_PROFILE( "serialIslandDispatch" );
     // serial dispatch
     btAlignedObjectArray<Island*>& islands = *islandsPtr;
+    btConstraintSolver* solver = solverParams.m_solverMt ? solverParams.m_solverMt : solverParams.m_solverPool;
     for ( int i = 0; i < islands.size(); ++i )
     {
-        Island* island = islands[ i ];
-        btPersistentManifold** manifolds = island->manifoldArray.size() ? &island->manifoldArray[ 0 ] : NULL;
-        btTypedConstraint** constraintsPtr = island->constraintArray.size() ? &island->constraintArray[ 0 ] : NULL;
-        callback->processIsland( &island->bodyArray[ 0 ],
-                                 island->bodyArray.size(),
-                                 manifolds,
-                                 island->manifoldArray.size(),
-                                 constraintsPtr,
-                                 island->constraintArray.size(),
-                                 island->id
-                                 );
+        solveIsland(solver, *islands[ i ], solverParams);
     }
 }
 
+
 struct UpdateIslandDispatcher : public btIParallelForBody
 {
-    btAlignedObjectArray<btSimulationIslandManagerMt::Island*>* islandsPtr;
-    btSimulationIslandManagerMt::IslandCallback* callback;
+    btAlignedObjectArray<btSimulationIslandManagerMt::Island*>& m_islandsPtr;
+    const btSimulationIslandManagerMt::SolverParams& m_solverParams;
+
+    UpdateIslandDispatcher(btAlignedObjectArray<btSimulationIslandManagerMt::Island*>& islandsPtr, const btSimulationIslandManagerMt::SolverParams& solverParams)
+        : m_islandsPtr(islandsPtr), m_solverParams(solverParams)
+    {}
 
     void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
     {
+        btConstraintSolver* solver = m_solverParams.m_solverPool;
         for ( int i = iBegin; i < iEnd; ++i )
         {
-            btSimulationIslandManagerMt::Island* island = ( *islandsPtr )[ i ];
-            btPersistentManifold** manifolds = island->manifoldArray.size() ? &island->manifoldArray[ 0 ] : NULL;
-            btTypedConstraint** constraintsPtr = island->constraintArray.size() ? &island->constraintArray[ 0 ] : NULL;
-            callback->processIsland( &island->bodyArray[ 0 ],
-                island->bodyArray.size(),
-                manifolds,
-                island->manifoldArray.size(),
-                constraintsPtr,
-                island->constraintArray.size(),
-                island->id
-            );
+            btSimulationIslandManagerMt::Island* island = m_islandsPtr[ i ];
+            btSimulationIslandManagerMt::solveIsland( solver, *island, m_solverParams );
         }
     }
 };
 
 
-void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback )
+void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, const SolverParams& solverParams )
 {
     BT_PROFILE( "parallelIslandDispatch" );
     //
@@ -617,24 +622,25 @@ void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray<I
     // any gains from parallelism.
     //
 
-    UpdateIslandDispatcher dispatcher;
-    dispatcher.islandsPtr = islandsPtr;
-    dispatcher.callback = callback;
+    UpdateIslandDispatcher dispatcher(*islandsPtr, solverParams);
     // We take advantage of the fact the islands are sorted in order of decreasing size
     int iBegin = 0;
-    while (iBegin < islandsPtr->size())
+    if (solverParams.m_solverMt)
     {
-        btSimulationIslandManagerMt::Island* island = (*islandsPtr)[ iBegin ];
-        if (island->manifoldArray.size() < btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching)
+        while ( iBegin < islandsPtr->size() )
         {
-            // OK to submit the rest of the array in parallel
-            break;
+            btSimulationIslandManagerMt::Island* island = ( *islandsPtr )[ iBegin ];
+            if ( island->manifoldArray.size() < btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching )
+            {
+                // OK to submit the rest of the array in parallel
+                break;
+            }
+            // serial dispatch to parallel solver for large islands (if any)
+            solveIsland(solverParams.m_solverMt, *island, solverParams);
+            ++iBegin;
         }
-        ++iBegin;
     }
-    // serial dispatch for large islands (if any)
-    dispatcher.forLoop(0, iBegin);
-    // parallel dispatch for rest
+    // parallel dispatch to sequential solvers for rest
     btParallelFor( iBegin, islandsPtr->size(), 1, dispatcher );
 }
 
@@ -643,15 +649,14 @@ void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray<I
 void btSimulationIslandManagerMt::buildAndProcessIslands( btDispatcher* dispatcher,
                                                         btCollisionWorld* collisionWorld,
                                                         btAlignedObjectArray<btTypedConstraint*>& constraints,
-                                                        IslandCallback* callback
+                                                        const SolverParams& solverParams
                                                         )
 {
+	BT_PROFILE("buildAndProcessIslands");
 	btCollisionObjectArray& collisionObjects = collisionWorld->getCollisionObjectArray();
 
 	buildIslands(dispatcher,collisionWorld);
 
-	BT_PROFILE("processIslands");
-
 	if(!getSplitIslands())
 	{
         btPersistentManifold** manifolds = dispatcher->getInternalManifoldPointer();
@@ -683,14 +688,17 @@ void btSimulationIslandManagerMt::buildAndProcessIslands( btDispatcher* dispatch
             }
         }
         btTypedConstraint** constraintsPtr = constraints.size() ? &constraints[ 0 ] : NULL;
-		callback->processIsland(&collisionObjects[0],
-                                 collisionObjects.size(),
-                                 manifolds,
-                                 maxNumManifolds,
-                                 constraintsPtr,
-                                 constraints.size(),
-                                 -1
-                                 );
+        btConstraintSolver* solver = solverParams.m_solverMt ? solverParams.m_solverMt : solverParams.m_solverPool;
+        solver->solveGroup(&collisionObjects[0],
+                           collisionObjects.size(),
+                           manifolds,
+                           maxNumManifolds,
+                           constraintsPtr,
+                           constraints.size(),
+                           *solverParams.m_solverInfo,
+                           solverParams.m_debugDrawer,
+                           solverParams.m_dispatcher
+                           );
 	}
 	else
 	{
@@ -710,6 +718,6 @@ void btSimulationIslandManagerMt::buildAndProcessIslands( btDispatcher* dispatch
             mergeIslands();
         }
         // dispatch islands to solver
-        m_islandDispatch( &m_activeIslands, callback );
+        m_islandDispatch( &m_activeIslands, solverParams );
 	}
 }
diff --git a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
index 31a2053b4..563577a6f 100644
--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
@@ -19,7 +19,9 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btSimulationIslandManager.h"
 
 class btTypedConstraint;
-
+class btConstraintSolver;
+struct btContactSolverInfo;
+class btIDebugDraw;
 
 ///
 /// SimulationIslandManagerMt -- Multithread capable version of SimulationIslandManager
@@ -45,22 +47,19 @@ public:
 
         void append( const Island& other );  // add bodies, manifolds, constraints to my own
     };
-    struct	IslandCallback
+    struct SolverParams  // bundle of solver inputs handed to the island dispatch functions and on to solveGroup()
     {
-        virtual ~IslandCallback() {};
-
-        virtual	void processIsland( btCollisionObject** bodies,
-                                    int numBodies,
-                                    btPersistentManifold** manifolds,
-                                    int numManifolds,
-                                    btTypedConstraint** constraints,
-                                    int numConstraints,
-                                    int islandId
-                                    ) = 0;
+        btConstraintSolver*		m_solverPool;  // used by the parallel per-island loop, and as the fallback when m_solverMt is NULL
+        btConstraintSolver*		m_solverMt;  // may be NULL; when set it is used for serial dispatch and for large islands in parallelIslandDispatch
+        btContactSolverInfo*	m_solverInfo;  // dereferenced when calling solveGroup(), so it must not be NULL
+        btIDebugDraw*			m_debugDrawer;  // forwarded unchanged to solveGroup()
+        btDispatcher*			m_dispatcher;  // forwarded unchanged to solveGroup()
     };
-    typedef void( *IslandDispatchFunc ) ( btAlignedObjectArray<Island*>* islands, IslandCallback* callback );
-    static void serialIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback );
-    static void parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback );
+    static void solveIsland(btConstraintSolver* solver, Island& island, const SolverParams& solverParams);
+
+    typedef void( *IslandDispatchFunc ) ( btAlignedObjectArray<Island*>* islands, const SolverParams& solverParams );
+    static void serialIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, const SolverParams& solverParams );
+    static void parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, const SolverParams& solverParams );
 protected:
     btAlignedObjectArray<Island*> m_allocatedIslands;  // owner of all Islands
     btAlignedObjectArray<Island*> m_activeIslands;  // islands actively in use
@@ -83,7 +82,11 @@ public:
 	btSimulationIslandManagerMt();
 	virtual ~btSimulationIslandManagerMt();
 
-    virtual void buildAndProcessIslands( btDispatcher* dispatcher, btCollisionWorld* collisionWorld, btAlignedObjectArray<btTypedConstraint*>& constraints, IslandCallback* callback );
+    virtual void buildAndProcessIslands( btDispatcher* dispatcher,
+        btCollisionWorld* collisionWorld,
+        btAlignedObjectArray<btTypedConstraint*>& constraints,
+        const SolverParams& solverParams
+    );
 
 	virtual void buildIslands(btDispatcher* dispatcher,btCollisionWorld* colWorld);
 
@@ -106,7 +109,6 @@ public:
     }
 };
 
-extern int gLargeIslandManifoldCount;
 
 #endif //BT_SIMULATION_ISLAND_MANAGER_H
 

From eec478709afa8220da9ddbb8888f4fd64cf90e8e Mon Sep 17 00:00:00 2001
From: Lunkhound <lunkhound@gmail.com>
Date: Tue, 6 Mar 2018 02:28:23 -0800
Subject: [PATCH 5/8] parallel solver: small tweaks and fixes

---
 .../ConstraintSolver/btBatchedConstraints.cpp | 24 +++++--------
 .../btSequentialImpulseConstraintSolverMt.cpp |  2 +-
 .../TaskScheduler/btTaskScheduler.cpp         | 34 ++-----------------
 .../TaskScheduler/btThreadSupportWin32.cpp    |  4 +--
 4 files changed, 14 insertions(+), 50 deletions(-)

diff --git a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
index a27350bf9..310601659 100644
--- a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
@@ -38,11 +38,10 @@ struct btBatchedConstraintInfo
 
 struct btBatchInfo
 {
-    int phaseId;
     int numConstraints;
     int mergeIndex;
 
-    btBatchInfo(int _phaseId = -1) : numConstraints(0), mergeIndex(-1), phaseId(_phaseId) {}
+    btBatchInfo() : numConstraints(0), mergeIndex(kNoMerge) {}
 };
 
 
@@ -728,7 +727,6 @@ struct AssignConstraintsToGridBatchesParams
     btIntVec3* bodyGridCoords;
     int numBodies;
     btBatchedConstraintInfo* conInfos;
-    char* constraintPhaseIds;
     int* constraintBatchIds;
     btIntVec3 gridChunkDim;
     int maxNumBatchesPerPhase;
@@ -807,7 +805,6 @@ static void assignConstraintsToGridBatches(const AssignConstraintsToGridBatchesP
         }
         int iBatch = iPhase * params.maxNumBatchesPerPhase + chunkCoord[ 0 ] + chunkCoord[ 1 ] * gridChunkDim[ 0 ] + chunkCoord[ 2 ] * gridChunkDim[ 0 ] * gridChunkDim[ 1 ];
         btAssert(iBatch >= 0 && iBatch < params.maxNumBatchesPerPhase*params.numPhases);
-        params.constraintPhaseIds[ iCon ] = iPhase;
         params.constraintBatchIds[ iCon ] = iBatch;
     }
 }
@@ -834,8 +831,7 @@ struct AssignConstraintsToGridBatchesLoop : public btIParallelForBody
 /*
 
 Bodies are treated as 3D points at their center of mass. We only consider dynamic bodies at this stage,
-kinematic and static bodies are dealt with at a later stage. Also we only consider constraints that
-are between 2 dynamic bodies ("dynamic" constraints) -- constraints that involve a static or kinematic body are handled later
+because only dynamic bodies are mutated when a constraint is solved, thus subject to race conditions.
 
 1. Compute a bounding box around all dynamic bodies
 2. Compute the maximum extent of all dynamic constraints. Each dynamic constraint is treated as a line segment, and we need the size of
@@ -845,15 +841,16 @@ are between 2 dynamic bodies ("dynamic" constraints) -- constraints that involve
    so that no dynamic constraint can span more than 2 cells of our grid on any axis of the grid. The cell size should be adjusted
    larger in order to keep the total number of cells from being excessively high
 
-Key idea: Given that each constraint spans 1 or 2 grid cells in each dimension, we can handle all dynamic constraints by processing
+Key idea: Given that each constraint spans 1 or 2 grid cells in each dimension, we can handle all constraints by processing
           in chunks of 2x2x2 cells with 8 different 1-cell offsets ((0,0,0),(0,0,1),(0,1,0),(0,1,1),(1,0,0)...).
           For each of the 8 offsets, we create a phase, and for each 2x2x2 chunk with dynamic constraints becomes a batch in that phase.
 
- Once all of the phases have been populated, if any of the phases end up with too few batches, they could possibly be merged with other phases.
+4. Once the grid is established, we can calculate for each constraint which phase and batch it belongs in.
 
- Finally, we handle all of the remaining (non-dynamic) constraints, these can be added to whichever phase is least populated to help
- even things out
+5. Run a merge-small-batches pass on each phase separately, to try to even out the sizes of batches
 
+Optionally, we can "collapse" one dimension of our 3D grid to turn it into a 2D grid, which reduces the number of phases
+to 4. With fewer phases, there are more constraints per phase and this makes it easier to create batches of a useful size.
 */
 //
 static void setupSpatialGridBatchesMt(
@@ -882,7 +879,6 @@ static void setupSpatialGridBatchesMt(
     btBatchInfo* batches = NULL;
     int* batchWork = NULL;
     btBatchedConstraintInfo* conInfos = NULL;
-    char* constraintPhaseIds = NULL;
     int* constraintBatchIds = NULL;
     int* constraintRowBatchIds = NULL;
     {
@@ -893,7 +889,6 @@ static void setupSpatialGridBatchesMt(
         memHelper.addChunk( (void**) &batches, sizeof( btBatchInfo )* allocNumBatches );
         memHelper.addChunk( (void**) &batchWork, sizeof( int )* allocNumBatches );
         memHelper.addChunk( (void**) &conInfos, sizeof( btBatchedConstraintInfo ) * numConstraints );
-        memHelper.addChunk( (void**) &constraintPhaseIds, sizeof( char ) * numConstraints );
         memHelper.addChunk( (void**) &constraintBatchIds, sizeof( int ) * numConstraints );
         memHelper.addChunk( (void**) &constraintRowBatchIds, sizeof( int ) * numConstraintRows );
         size_t scratchSize = memHelper.getSizeToAllocate();
@@ -1010,7 +1005,7 @@ static void setupSpatialGridBatchesMt(
         for ( int iBatch = batchBegin; iBatch < batchEnd; ++iBatch )
         {
             btBatchInfo& batch = batches[ iBatch ];
-            batch = btBatchInfo( iPhase );
+            batch = btBatchInfo();
         }
     }
 
@@ -1020,7 +1015,6 @@ static void setupSpatialGridBatchesMt(
         params.bodyGridCoords = bodyGridCoords;
         params.numBodies = bodies.size();
         params.conInfos = conInfos;
-        params.constraintPhaseIds = constraintPhaseIds;
         params.constraintBatchIds = constraintBatchIds;
         params.gridChunkDim = gridChunkDim;
         params.maxNumBatchesPerPhase = maxNumBatchesPerPhase;
@@ -1030,7 +1024,7 @@ static void setupSpatialGridBatchesMt(
         if (inParallel)
         {
             AssignConstraintsToGridBatchesLoop loop(params);
-            int grainSize = 500;
+            int grainSize = 250;
             btParallelFor(0, numConstraints, grainSize, loop);
         }
         else
diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
index b09665b15..b9ad17a03 100644
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
@@ -940,7 +940,7 @@ void btSequentialImpulseConstraintSolverMt::solveGroupCacheFriendlySplitImpulseI
                 {
                     int iPhase = batchedCons.m_phaseOrder[ iiPhase ];
                     const btBatchedConstraints::Range& phase = batchedCons.m_phases[ iPhase ];
-                    int grainSize = 8;
+                    int grainSize = batchedCons.m_phaseGrainSize[iPhase];
                     leastSquaresResidual += btParallelSum( phase.begin, phase.end, grainSize, loop );
                 }
             }
diff --git a/src/LinearMath/TaskScheduler/btTaskScheduler.cpp b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp
index e02458367..1aa7d44d4 100644
--- a/src/LinearMath/TaskScheduler/btTaskScheduler.cpp
+++ b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp
@@ -17,42 +17,12 @@ typedef void* ( *btThreadLocalStorageFunc )();
 
 
 
-///
-/// getNumHardwareThreads()
-///
-///
-/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
-///
-#if __cplusplus >= 201103L
-
-#include <thread>
-
-int getNumHardwareThreads()
-{
-    return std::thread::hardware_concurrency();
-}
-
-#elif defined( _WIN32 )
+#if defined( _WIN32 )
 
 #define WIN32_LEAN_AND_MEAN
 
 #include <windows.h>
 
-int getNumHardwareThreads()
-{
-    // caps out at 32
-    SYSTEM_INFO info;
-    GetSystemInfo( &info );
-    return info.dwNumberOfProcessors;
-}
-
-#else
-
-int getNumHardwareThreads()
-{
-    return 0;  // don't know
-}
-
 #endif
 
 
@@ -581,7 +551,6 @@ public:
 
             // put the main thread to work on emptying the job queue and then wait for all workers to finish
             waitJobs();
-            m_antiNestingLock.unlock();
 
             // add up all the thread sums
             btScalar sum = btScalar(0);
@@ -589,6 +558,7 @@ public:
             {
                 sum += threadLocalSum[ iThread ].mSum;
             }
+            m_antiNestingLock.unlock();
             return sum;
         }
         else
diff --git a/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
index f77616337..de693590e 100644
--- a/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
+++ b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
@@ -267,8 +267,8 @@ DWORD WINAPI win32threadStartFunc( LPVOID lpParam )
 void btThreadSupportWin32::runTask( int threadIndex, void* userData )
 {
     btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ];
-    btAssert( taskId >= 0 );
-    btAssert( int( taskId ) < m_activeThreadStatus.size() );
+    btAssert( threadIndex >= 0 );
+    btAssert( int( threadIndex ) < m_activeThreadStatus.size() );
 
     threadStatus.m_commandId = 1;
     threadStatus.m_status = 1;

From e526e48df837d282f4a59554750769b21bbc2104 Mon Sep 17 00:00:00 2001
From: Lunkhound <lunkhound@gmail.com>
Date: Tue, 13 Mar 2018 04:19:02 -0700
Subject: [PATCH 6/8] parallel solver: slightly overallocate to reduce how
 often allocation is needed

---
 .../ConstraintSolver/btBatchedConstraints.cpp |  6 ++++++
 .../btSequentialImpulseConstraintSolverMt.cpp | 20 +++++++++++++++----
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
index 310601659..84a00dc63 100644
--- a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
@@ -892,6 +892,12 @@ static void setupSpatialGridBatchesMt(
         memHelper.addChunk( (void**) &constraintBatchIds, sizeof( int ) * numConstraints );
         memHelper.addChunk( (void**) &constraintRowBatchIds, sizeof( int ) * numConstraintRows );
         size_t scratchSize = memHelper.getSizeToAllocate();
+        // if we need to reallocate
+        if (scratchMemory->capacity() < scratchSize)
+        {
+            // allocate 6.25% extra to avoid repeated reallocs
+            scratchMemory->reserve( scratchSize + scratchSize/16 );
+        }
         scratchMemory->resizeNoInitialize( scratchSize );
         char* memPtr = &scratchMemory->at(0);
         memHelper.setChunkPointers( memPtr );
diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
index b9ad17a03..4ccf7b247 100644
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
@@ -568,10 +568,22 @@ void btSequentialImpulseConstraintSolverMt::allocAllContactConstraints(btPersist
                 }
             }
         }
-        m_tmpSolverContactConstraintPool.resizeNoInitialize(numContacts);
-        m_rollingFrictionIndexTable.resizeNoInitialize(numContacts);
-        m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize(numContacts*m_numFrictionDirections);
-        m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize(numRollingFrictionConstraints);
+        {
+            BT_PROFILE( "allocPools" );
+            if ( m_tmpSolverContactConstraintPool.capacity() < numContacts )
+            {
+                // if we need to reallocate, reserve some extra so we don't have to reallocate again next frame
+                int extraReserve = numContacts / 16;
+                m_tmpSolverContactConstraintPool.reserve( numContacts + extraReserve );
+                m_rollingFrictionIndexTable.reserve( numContacts + extraReserve );
+                m_tmpSolverContactFrictionConstraintPool.reserve( ( numContacts + extraReserve )*m_numFrictionDirections );
+                m_tmpSolverContactRollingFrictionConstraintPool.reserve( numRollingFrictionConstraints + extraReserve );
+            }
+            m_tmpSolverContactConstraintPool.resizeNoInitialize( numContacts );
+            m_rollingFrictionIndexTable.resizeNoInitialize( numContacts );
+            m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize( numContacts*m_numFrictionDirections );
+            m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize( numRollingFrictionConstraints );
+        }
     }
     {
         AllocContactConstraintsLoop loop(this, &cachedInfoArray[0]);

From 04e0d57dc1272390eee711f354e3a6879855d7b2 Mon Sep 17 00:00:00 2001
From: Lunkhound <lunkhound@gmail.com>
Date: Fri, 16 Mar 2018 23:42:43 -0700
Subject: [PATCH 7/8] add premake option 'enable_multithreading'

---
 build3/premake4.lua         | 8 ++++++++
 src/LinearMath/premake4.lua | 4 +++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/build3/premake4.lua b/build3/premake4.lua
index a39e926e5..d49b9d440 100644
--- a/build3/premake4.lua
+++ b/build3/premake4.lua
@@ -182,6 +182,14 @@ end
 		trigger = "audio",
 		description = "Enable audio"
 	}
+	newoption
+	{
+		trigger = "enable_multithreading",
+		description = "enable CPU multithreading for bullet2 libs"
+	}
+	if _OPTIONS["enable_multithreading"] then
+		defines {"BT_THREADSAFE=1"}
+	end
 	if _OPTIONS["double"] then
 		defines {"BT_USE_DOUBLE_PRECISION"}
 	end
diff --git a/src/LinearMath/premake4.lua b/src/LinearMath/premake4.lua
index 5f0fda6bf..3765811a9 100644
--- a/src/LinearMath/premake4.lua
+++ b/src/LinearMath/premake4.lua
@@ -9,5 +9,7 @@
 	}
 	files {
 		"*.cpp",
-		"*.h"
+		"*.h",
+		"TaskScheduler/*.cpp",
+		"TaskScheduler/*.h"
 	}

From bdc3c2bafb2d51f954da3b203233bc27b9224959 Mon Sep 17 00:00:00 2001
From: Lunkhound <lunkhound@gmail.com>
Date: Fri, 16 Mar 2018 16:38:11 -0700
Subject: [PATCH 8/8] task scheduler: add multiple job queues to improve
 performance when there are many threads

---
 .../TaskScheduler/btTaskScheduler.cpp         | 557 ++++++++++++------
 .../TaskScheduler/btThreadSupportInterface.h  |   9 +-
 .../TaskScheduler/btThreadSupportPosix.cpp    |  11 +-
 .../TaskScheduler/btThreadSupportWin32.cpp    |  14 +-
 4 files changed, 386 insertions(+), 205 deletions(-)

diff --git a/src/LinearMath/TaskScheduler/btTaskScheduler.cpp b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp
index 1aa7d44d4..02fe07ab1 100644
--- a/src/LinearMath/TaskScheduler/btTaskScheduler.cpp
+++ b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp
@@ -7,16 +7,11 @@
 #include <algorithm>
 
 
-typedef void( *btThreadFunc )( void* userPtr, void* lsMemory );
-typedef void* ( *btThreadLocalStorageFunc )();
 
 #if BT_THREADSAFE
 
 #include "btThreadSupportInterface.h"
 
-
-
-
 #if defined( _WIN32 )
 
 #define WIN32_LEAN_AND_MEAN
@@ -26,6 +21,9 @@ typedef void* ( *btThreadLocalStorageFunc )();
 #endif
 
 
+typedef unsigned long long btU64;
+static const int kCacheLineSize = 64;
+
 void btSpinPause()
 {
 #if defined( _WIN32 )
@@ -46,6 +44,62 @@ struct WorkerThreadStatus
 };
 
 
+// Holds one directive (see Type) per worker thread, indexed by thread id.
+// NOTE(review): reads and writes are plain char accesses with no atomics or locking here -- confirm callers rely on that.
+ATTRIBUTE_ALIGNED64(class) WorkerThreadDirectives
+{
+    static const int kMaxThreadCount = BT_MAX_THREAD_COUNT;
+    // directives for all worker threads packed together (one cacheline only if kMaxThreadCount <= kCacheLineSize)
+    char m_threadDirs[kMaxThreadCount];
+
+public:
+    enum Type
+    {
+        kInvalid,           // value every slot holds after construction
+        kGoToSleep,         // go to sleep
+        kStayAwakeButIdle,  // stay awake, but do not check the job queue
+        kScanForJobs,       // actively scan job queue for jobs
+    };
+    WorkerThreadDirectives()
+    {
+        for ( int i = 0; i < kMaxThreadCount; ++i )
+            m_threadDirs[ i ] = static_cast<char>( kInvalid );
+    }
+
+    // Returns the directive currently stored for threadId.
+    Type getDirective(int threadId)
+    {
+        btAssert( threadId >= 0 && threadId < kMaxThreadCount );  // reject negative ids as well as too-large ones
+        return static_cast<Type>(m_threadDirs[threadId]);
+    }
+
+    // Stores dir for every thread id in the half-open range [threadBegin, threadEnd).
+    void setDirectiveByRange(int threadBegin, int threadEnd, Type dir)
+    {
+        btAssert( threadBegin >= 0 && threadBegin < threadEnd );
+        btAssert( threadEnd <= kMaxThreadCount );
+        char dirChar = static_cast<char>(dir);
+        for ( int i = threadBegin; i < threadEnd; ++i )
+            m_threadDirs[ i ] = dirChar;
+    }
+};
+
+class JobQueue;
+
+ATTRIBUTE_ALIGNED64(struct) ThreadLocalStorage  // per-thread record; ParallelSumJob indexes an array of these by threadId
+{
+    int m_threadId;
+    WorkerThreadStatus::Type m_status;
+    int m_numJobsFinished;
+    btSpinMutex m_mutex;
+    btScalar m_sumResult;  // running partial sum: ParallelSumJob::executeJob adds each job's sumLoop() result here
+    WorkerThreadDirectives * m_directive;
+    JobQueue* m_queue;  // NOTE(review): presumably the queue this thread takes jobs from -- confirm against the worker loop
+    btClock* m_clock;
+    unsigned int m_cooldownTime;  // NOTE(review): supersedes JobContext::m_coolDownTime (was 1000 microseconds) -- confirm units
+};
+
+
 struct IJob
 {
     virtual void executeJob(int threadId) = 0;
@@ -53,88 +107,152 @@ struct IJob
 
 class ParallelForJob : public IJob
 {
-    const btIParallelForBody* mBody;
-    int mBegin;
-    int mEnd;
+    const btIParallelForBody* m_body;
+    int m_begin;
+    int m_end;
 
 public:
     ParallelForJob( int iBegin, int iEnd, const btIParallelForBody& body )
     {
-        mBody = &body;
-        mBegin = iBegin;
-        mEnd = iEnd;
+        m_body = &body;
+        m_begin = iBegin;
+        m_end = iEnd;
     }
     virtual void executeJob(int threadId) BT_OVERRIDE
     {
         BT_PROFILE( "executeJob" );
 
         // call the functor body to do the work
-        mBody->forLoop( mBegin, mEnd );
+        m_body->forLoop( m_begin, m_end );
     }
 };
 
-static const int kCacheLineSize = 64;
-
-struct ThreadLocalSum
-{
-    btScalar mSum;
-    char mCachePadding[ kCacheLineSize - sizeof( btScalar ) ];
-};
 
 class ParallelSumJob : public IJob
 {
-    const btIParallelSumBody* mBody;
-    ThreadLocalSum* mSumArray;
-    int mBegin;
-    int mEnd;
+    const btIParallelSumBody* m_body;
+    ThreadLocalStorage* m_threadLocalStoreArray;
+    int m_begin;
+    int m_end;
 
 public:
-    ParallelSumJob( int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalSum* sums )
+    ParallelSumJob( int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalStorage* tls )
     {
-        mBody = &body;
-        mSumArray = sums;
-        mBegin = iBegin;
-        mEnd = iEnd;
+        m_body = &body;
+        m_threadLocalStoreArray = tls;
+        m_begin = iBegin;
+        m_end = iEnd;
     }
     virtual void executeJob( int threadId ) BT_OVERRIDE
     {
         BT_PROFILE( "executeJob" );
 
         // call the functor body to do the work
-        btScalar val = mBody->sumLoop( mBegin, mEnd );
+        btScalar val = m_body->sumLoop( m_begin, m_end );
+#if BT_PARALLEL_SUM_DETERMINISTISM
         // by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision)
         const float TRUNC_SCALE = float(1<<19);
         val = floor(val*TRUNC_SCALE+0.5f)/TRUNC_SCALE;  // truncate some bits
-        mSumArray[threadId].mSum += val;
+#endif
+        m_threadLocalStoreArray[threadId].m_sumResult += val;
     }
 };
 
 
-struct JobContext
+ATTRIBUTE_ALIGNED64(class) JobQueue
 {
-    JobContext()
-    {
-        m_queueLock = NULL;
-        m_headIndex = 0;
-        m_tailIndex = 0;
-        m_workersShouldCheckQueue = false;
-        m_workersShouldSleep = false;
-        m_useSpinMutex = false;
-        m_coolDownTime = 1000; // 1000 microseconds
-    }
+    btThreadSupportInterface* m_threadSupport;
     btCriticalSection* m_queueLock;
     btSpinMutex m_mutex;
-    volatile bool m_workersShouldCheckQueue;
-    volatile bool m_workersShouldSleep;
 
     btAlignedObjectArray<IJob*> m_jobQueue;
+    char* m_jobMem;
+    int m_jobMemSize;
     bool m_queueIsEmpty;
     int m_tailIndex;
     int m_headIndex;
+    int m_allocSize;
     bool m_useSpinMutex;
-    unsigned int m_coolDownTime;
-    btClock m_clock;
+    btAlignedObjectArray<JobQueue*> m_neighborContexts;
+    char m_cachePadding[kCacheLineSize];  // prevent false sharing
 
+    void freeJobMem()
+    {
+        if ( m_jobMem )
+        {
+            // free old
+            btAlignedFree(m_jobMem);
+            m_jobMem = NULL;
+        }
+    }
+    void resizeJobMem(int newSize)
+    {
+        if (newSize > m_jobMemSize)
+        {
+            freeJobMem();
+            m_jobMem = static_cast<char*>(btAlignedAlloc(newSize, kCacheLineSize));
+            m_jobMemSize = newSize;
+        }
+    }
+
+public:
+
+    JobQueue()  // leaves the queue empty, with no job memory, lock, or thread support attached
+    {
+        m_jobMem = NULL;
+        m_jobMemSize = 0;
+        m_threadSupport = NULL;
+        m_queueLock = NULL;
+        m_headIndex = m_tailIndex = m_allocSize = 0;  // m_allocSize is read by allocJobMem(), so it needs a defined start value
+        m_queueIsEmpty = true;  // nothing queued yet; isQueueEmpty() and consumeJobFromOwnQueue() read this flag
+        m_useSpinMutex = false;
+    }
+    ~JobQueue()
+    {
+        freeJobMem();
+        if (m_queueLock && m_threadSupport)
+        {
+            m_threadSupport->deleteCriticalSection(m_queueLock);
+            m_queueLock = NULL;
+        }
+    }
+    void init(btThreadSupportInterface* threadSup, btAlignedObjectArray<JobQueue>* contextArray)
+    {
+        m_threadSupport = threadSup;
+        if (threadSup)
+        {
+            m_queueLock = m_threadSupport->createCriticalSection();
+        }
+        setupJobStealing(contextArray, contextArray->size());
+    }
+    void setupJobStealing(btAlignedObjectArray<JobQueue>* contextArray, int numActiveContexts)  // rebuilds m_neighborContexts from nearby entries of contextArray
+    {
+        btAlignedObjectArray<JobQueue>& contexts = *contextArray;
+        int selfIndex = 0;  // stays 0 if this queue is not found in the array
+        for (int i = 0; i < contexts.size(); ++i)
+        {
+            if ( this == &contexts[ i ] )  // locate our own slot by address
+            {
+                selfIndex = i;
+                break;
+            }
+        }
+        int numNeighbors = btMin(2, contexts.size() - 1);  // at most two neighbors; note this uses the full array size, not numActiveContexts
+        int neighborOffsets[ ] = {-1, 1, -2, 2, -3, 3};  // candidate slots relative to selfIndex, nearest first
+        int numOffsets = sizeof(neighborOffsets)/sizeof(neighborOffsets[0]);
+        m_neighborContexts.reserve( numNeighbors );
+        m_neighborContexts.resizeNoInitialize(0);  // drop any neighbors recorded by an earlier call
+        for (int i = 0; i < numOffsets && m_neighborContexts.size() < numNeighbors; i++)
+        {
+            int neighborIndex = selfIndex + neighborOffsets[i];
+            if ( neighborIndex >= 0 && neighborIndex < numActiveContexts)  // only slots inside the active range qualify
+            {
+                m_neighborContexts.push_back( &contexts[ neighborIndex ] );  // NOTE(review): stores element addresses -- presumably the array must not reallocate afterwards; confirm
+            }
+        }
+    }
+
+    bool isQueueEmpty() const {return m_queueIsEmpty;}
     void lockQueue()
     {
         if ( m_useSpinMutex )
@@ -157,24 +275,44 @@ struct JobContext
             m_queueLock->unlock();
         }
     }
-    void clearQueue()
+    void clearQueue(int jobCount, int jobSize)  // empties the queue and makes room for jobCount jobs of jobSize bytes each
     {
         lockQueue();
         m_headIndex = 0;
         m_tailIndex = 0;
+        m_allocSize = 0;  // allocJobMem() hands out memory from the start of m_jobMem again
         m_queueIsEmpty = true;
+        int jobBufSize = jobSize * jobCount;  // bytes needed if every one of the jobCount jobs lands in this queue
+        // make sure we have enough memory allocated to store jobs
+        if ( jobBufSize > m_jobMemSize )
+        {
+            resizeJobMem( jobBufSize );
+        }
+        // make sure job queue is big enough
+        if ( jobCount > m_jobQueue.capacity() )
+        {
+            m_jobQueue.reserve( jobCount );
+        }
         unlockQueue();
         m_jobQueue.resizeNoInitialize( 0 );
     }
+    void* allocJobMem(int jobSize)  // returns the next jobSize bytes of m_jobMem; takes no lock itself
+    {
+        btAssert(m_jobMemSize >= (m_allocSize + jobSize));  // the only capacity check; clearQueue() is what sizes the buffer
+        void* jobMem = &m_jobMem[m_allocSize];
+        m_allocSize += jobSize;  // advance the allocation offset past the block just handed out
+        return jobMem;
+    }
     void submitJob( IJob* job )
     {
+        btAssert( reinterpret_cast<char*>( job ) >= &m_jobMem[ 0 ] && reinterpret_cast<char*>( job ) < &m_jobMem[ 0 ] + m_allocSize );  // job must lie inside the part of m_jobMem handed out so far
         m_jobQueue.push_back( job );
         lockQueue();
         m_tailIndex++;
         m_queueIsEmpty = false;
         unlockQueue();
     }
-    IJob* consumeJob()
+    IJob* consumeJobFromOwnQueue()
     {
         if ( m_queueIsEmpty )
         {
@@ -186,6 +324,7 @@ struct JobContext
         if ( !m_queueIsEmpty )
         {
             job = m_jobQueue[ m_headIndex++ ];
+            btAssert( reinterpret_cast<char*>( job ) >= &m_jobMem[ 0 ] && reinterpret_cast<char*>( job ) < &m_jobMem[ 0 ] + m_allocSize );
             if ( m_headIndex == m_tailIndex )
             {
                 m_queueIsEmpty = true;
@@ -194,58 +333,78 @@ struct JobContext
         unlockQueue();
         return job;
     }
+    IJob* consumeJob()  // next job from this queue, else from a neighbor queue, else NULL
+    {
+        if (IJob* job = consumeJobFromOwnQueue())
+        {
+            return job;
+        }
+        // own queue is empty, try to steal from neighbor
+        for (int i = 0; i < m_neighborContexts.size(); ++i)  // neighbors are tried in the order setupJobStealing() stored them
+        {
+            JobQueue* otherContext = m_neighborContexts[ i ];
+            if ( IJob* job = otherContext->consumeJobFromOwnQueue() )  // never recurses into the neighbor's own neighbors
+            {
+                return job;
+            }
+        }
+        return NULL;  // nothing available in this queue or in any neighbor
+    }
 };
 
 
-struct WorkerThreadLocalStorage
-{
-    int threadId;
-    WorkerThreadStatus::Type status;
-    int numJobsFinished;
-    btSpinMutex m_mutex;
-};
-
-
-static void WorkerThreadFunc( void* userPtr, void* lsMemory )
+static void WorkerThreadFunc( void* userPtr )
 {
     BT_PROFILE( "WorkerThreadFunc" );
-    WorkerThreadLocalStorage* localStorage = (WorkerThreadLocalStorage*) lsMemory;
-    JobContext* jobContext = (JobContext*) userPtr;
+    ThreadLocalStorage* localStorage = (ThreadLocalStorage*) userPtr;
+    JobQueue* jobQueue = localStorage->m_queue;
 
     bool shouldSleep = false;
+    int threadId = localStorage->m_threadId;
     while (! shouldSleep)
     {
         // do work
         localStorage->m_mutex.lock();
-        while ( IJob* job = jobContext->consumeJob() )
+        while ( IJob* job = jobQueue->consumeJob() )
         {
-            localStorage->status = WorkerThreadStatus::kWorking;
-            job->executeJob( localStorage->threadId );
-            localStorage->numJobsFinished++;
+            localStorage->m_status = WorkerThreadStatus::kWorking;
+            job->executeJob( threadId );
+            localStorage->m_numJobsFinished++;
         }
-        localStorage->status = WorkerThreadStatus::kWaitingForWork;
+        localStorage->m_status = WorkerThreadStatus::kWaitingForWork;
         localStorage->m_mutex.unlock();
-        unsigned long long int clockStart = jobContext->m_clock.getTimeMicroseconds();
+        btU64 clockStart = localStorage->m_clock->getTimeMicroseconds();
         // while queue is empty,
-        while (jobContext->m_queueIsEmpty)
+        while (jobQueue->isQueueEmpty())
         {
             // todo: spin wait a bit to avoid hammering the empty queue
             btSpinPause();
-            if ( jobContext->m_workersShouldSleep )
+            if ( localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kGoToSleep )
             {
                 shouldSleep = true;
                 break;
             }
             // if jobs are incoming,
-            if (jobContext->m_workersShouldCheckQueue)
+            if ( localStorage->m_directive->getDirective( threadId ) == WorkerThreadDirectives::kScanForJobs )
             {
-                clockStart = jobContext->m_clock.getTimeMicroseconds(); // reset clock
+                clockStart = localStorage->m_clock->getTimeMicroseconds(); // reset clock
             }
             else
             {
+                for ( int i = 0; i < 50; ++i )
+                {
+                    btSpinPause();
+                    btSpinPause();
+                    btSpinPause();
+                    btSpinPause();
+                    if (localStorage->m_directive->getDirective( threadId ) == WorkerThreadDirectives::kScanForJobs || !jobQueue->isQueueEmpty())
+                    {
+                        break;
+                    }
+                }
                 // if no jobs incoming and queue has been empty for the cooldown time, sleep
-                unsigned long long int timeElapsed = jobContext->m_clock.getTimeMicroseconds() - clockStart;
-                if (timeElapsed > jobContext->m_coolDownTime)
+                btU64 timeElapsed = localStorage->m_clock->getTimeMicroseconds() - clockStart;
+                if (timeElapsed > localStorage->m_cooldownTime)
                 {
                     shouldSleep = true;
                     break;
@@ -254,77 +413,107 @@ static void WorkerThreadFunc( void* userPtr, void* lsMemory )
         }
     }
 
-    // go idle
+    // go to sleep
     localStorage->m_mutex.lock();
-    localStorage->status = WorkerThreadStatus::kSleeping;
+    localStorage->m_status = WorkerThreadStatus::kSleeping;
     localStorage->m_mutex.unlock();
 }
 
 
-static void* WorkerThreadAllocFunc()
-{
-    return new WorkerThreadLocalStorage;
-}
-
-
-
 class btTaskSchedulerDefault : public btITaskScheduler
 {
-    JobContext m_jobContext;
     btThreadSupportInterface* m_threadSupport;
-    btAlignedObjectArray<char> m_jobMem;
-    btAlignedObjectArray<char> m_threadLocalMem;
+    WorkerThreadDirectives* m_workerDirective;
+    btAlignedObjectArray<JobQueue> m_jobQueues;
+    btAlignedObjectArray<JobQueue*> m_perThreadJobQueues;
+    btAlignedObjectArray<ThreadLocalStorage> m_threadLocalStorage;
     btSpinMutex m_antiNestingLock;  // prevent nested parallel-for
+    btClock m_clock;
     int m_numThreads;
     int m_numWorkerThreads;
+    int m_numActiveJobQueues;
     int m_maxNumThreads;
     int m_numJobs;
+    static const int kFirstWorkerThreadId = 1;
 public:
 
     btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
     {
         m_threadSupport = NULL;
+        m_workerDirective = NULL;  // allocated in init(), released in the destructor
     }
 
     virtual ~btTaskSchedulerDefault()
     {
-        shutdown();
+        waitForWorkersToSleep();  // no worker may still be running while queues and thread support are torn down
+        // ~JobQueue() calls m_threadSupport->deleteCriticalSection(), so the queues have to be destroyed
+        // while the thread support object is still alive (it used to be deleted before the member array died)
+        m_jobQueues.clear();
+        delete m_threadSupport;  // deleting NULL is a no-op, so no guard is needed
+        m_threadSupport = NULL;
+        if (m_workerDirective)
+        {
+            btAlignedFree(m_workerDirective);
+            m_workerDirective = NULL;
+        }
     }
 
     void init()
     {
-        btThreadSupportInterface::ConstructionInfo constructionInfo( "TaskScheduler", WorkerThreadFunc, WorkerThreadAllocFunc );
+        btThreadSupportInterface::ConstructionInfo constructionInfo( "TaskScheduler", WorkerThreadFunc );
         m_threadSupport = btThreadSupportInterface::create( constructionInfo );
+        m_workerDirective = static_cast<WorkerThreadDirectives*>(btAlignedAlloc(sizeof(*m_workerDirective), 64));  // 64-byte aligned storage, freed with btAlignedFree in the destructor
 
         m_numWorkerThreads = m_threadSupport->getNumWorkerThreads();
         m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1;
         m_numThreads = m_maxNumThreads;
-        m_jobContext.m_queueLock = m_threadSupport->createCriticalSection();
-        for ( int i = 0; i < m_numWorkerThreads; i++ )
+        // ideal to have one job queue for each physical processor (except for the main thread which needs no queue)
+        int numThreadsPerQueue = btMax( 1, m_threadSupport->getLogicalToPhysicalCoreRatio() );  // used as a divisor below, so never let it be 0
+        int numJobQueues = (numThreadsPerQueue == 1) ? (m_maxNumThreads-1) : ((m_maxNumThreads + numThreadsPerQueue - 1) / numThreadsPerQueue);  // round up so that index i / numThreadsPerQueue below always exists
+        m_jobQueues.resize(numJobQueues);
+        m_numActiveJobQueues = numJobQueues;
+        for ( int i = 0; i < m_jobQueues.size(); ++i )
         {
-            WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
-            btAssert( storage );
-            storage->threadId = i + 1;  // workers start at 1
-            storage->status = WorkerThreadStatus::kSleeping;
+            m_jobQueues[i].init( m_threadSupport, &m_jobQueues );
         }
-        setWorkersActive( false ); // no work for them yet
+        m_perThreadJobQueues.resize(m_numThreads);
+        for ( int i = 0; i < m_numThreads; i++ )
+        {
+            JobQueue* jq = NULL;
+            // only worker threads get a job queue
+            if (i > 0)
+            {
+                if (numThreadsPerQueue == 1)
+                {
+                    // one queue per worker thread
+                    jq = &m_jobQueues[ i - kFirstWorkerThreadId ];
+                }
+                else
+                {
+                    // numThreadsPerQueue threads share each queue
+                    jq = &m_jobQueues[ i / numThreadsPerQueue ];
+                }
+            }
+            m_perThreadJobQueues[i] = jq;
+        }
+        m_threadLocalStorage.resize(m_numThreads);
+        for ( int i = 0; i < m_numThreads; i++ )
+        {
+            ThreadLocalStorage& storage = m_threadLocalStorage[i];
+            storage.m_threadId = i;
+            storage.m_directive = m_workerDirective;
+            storage.m_status = WorkerThreadStatus::kSleeping;
+            storage.m_cooldownTime = 1000; // 1000 microseconds, threads go to sleep after this long if they have nothing to do
+            storage.m_clock = &m_clock;
+            storage.m_queue = m_perThreadJobQueues[i];
+        }
+        setWorkerDirectives( WorkerThreadDirectives::kGoToSleep ); // no work for them yet
         setNumThreads( m_threadSupport->getCacheFriendlyNumThreads() );
     }
 
-    virtual void shutdown()
+    void setWorkerDirectives(WorkerThreadDirectives::Type dir)  // gives every current worker thread the directive dir; thread 0 (main) is not included
     {
-        setWorkersActive( false );
-        waitForWorkersToSleep();
-        m_threadSupport->deleteCriticalSection( m_jobContext.m_queueLock );
-        m_jobContext.m_queueLock = NULL;
-
-        delete m_threadSupport;
-        m_threadSupport = NULL;
-    }
-
-    void setWorkersActive( bool active )
-    {
-        m_jobContext.m_workersShouldCheckQueue = active;
+        m_workerDirective->setDirectiveByRange(kFirstWorkerThreadId, m_numThreads, dir);  // presumably a half-open [begin, end) range of thread ids - TODO confirm
     }
 
     virtual int getMaxNumThreads() const BT_OVERRIDE
@@ -341,38 +530,56 @@ public:
     {
         m_numThreads = btMax( btMin(numThreads, int(m_maxNumThreads)), 1 );
         m_numWorkerThreads = m_numThreads - 1;
+        m_numActiveJobQueues = 0;
+        // if there is at least 1 worker,
+        if ( m_numWorkerThreads > 0 )
+        {
+            // re-setup job stealing between queues to avoid attempting to steal from an inactive job queue
+            JobQueue* lastActiveContext = m_perThreadJobQueues[ m_numThreads - 1 ];
+            int iLastActiveContext = lastActiveContext - &m_jobQueues[0];
+            m_numActiveJobQueues = iLastActiveContext + 1;
+            for ( int i = 0; i < m_jobQueues.size(); ++i )
+            {
+                m_jobQueues[ i ].setupJobStealing( &m_jobQueues, m_numActiveJobQueues );
+            }
+        }
+        m_workerDirective->setDirectiveByRange(m_numThreads, BT_MAX_THREAD_COUNT, WorkerThreadDirectives::kGoToSleep);
     }
 
     void waitJobs()
     {
         BT_PROFILE( "waitJobs" );
-        // have the main thread work until the job queue is empty
+        // have the main thread work until the job queues are empty
         int numMainThreadJobsFinished = 0;
-        while ( IJob* job = m_jobContext.consumeJob() )
+        for ( int i = 0; i < m_numActiveJobQueues; ++i )
         {
-            job->executeJob( 0 );
-            numMainThreadJobsFinished++;
+            while ( IJob* job = m_jobQueues[i].consumeJob() )
+            {
+                job->executeJob( 0 );
+                numMainThreadJobsFinished++;
+            }
         }
-        // done with jobs for now, tell workers to rest
-        setWorkersActive( false );
 
-        unsigned long long int clockStart = m_jobContext.m_clock.getTimeMicroseconds();
+        // done with jobs for now, tell workers to rest (but not sleep)
+        setWorkerDirectives( WorkerThreadDirectives::kStayAwakeButIdle );
+
+        btU64 clockStart = m_clock.getTimeMicroseconds();
         // wait for workers to finish any jobs in progress
         while ( true )
         {
             int numWorkerJobsFinished = 0;
-            for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
+            for ( int iThread = kFirstWorkerThreadId; iThread < m_numThreads; ++iThread )
             {
-                WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
+                ThreadLocalStorage* storage = &m_threadLocalStorage[iThread];
                 storage->m_mutex.lock();
-                numWorkerJobsFinished += storage->numJobsFinished;
+                numWorkerJobsFinished += storage->m_numJobsFinished;
                 storage->m_mutex.unlock();
             }
             if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs)
             {
                 break;
             }
-            unsigned long long int timeElapsed = m_jobContext.m_clock.getTimeMicroseconds() - clockStart;
+            btU64 timeElapsed = m_clock.getTimeMicroseconds() - clockStart;
             btAssert(timeElapsed < 1000);
             if (timeElapsed > 100000)
             {
@@ -385,25 +592,25 @@ public:
     void wakeWorkers(int numWorkersToWake)
     {
         BT_PROFILE( "wakeWorkers" );
-        btAssert( m_jobContext.m_workersShouldCheckQueue );
+        btAssert( m_workerDirective->getDirective(1) == WorkerThreadDirectives::kScanForJobs );
         int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads);
         int numActiveWorkers = 0;
         for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
         {
             // note this count of active workers is not necessarily totally reliable, because a worker thread could be
             // just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare.
-            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
-            if (storage->status != WorkerThreadStatus::kSleeping)
+            ThreadLocalStorage& storage = m_threadLocalStorage[ kFirstWorkerThreadId + iWorker ];
+            if (storage.m_status != WorkerThreadStatus::kSleeping)
             {
                 numActiveWorkers++;
             }
         }
         for ( int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker )
         {
-            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
-            if (storage->status == WorkerThreadStatus::kSleeping)
+            ThreadLocalStorage& storage = m_threadLocalStorage[ kFirstWorkerThreadId + iWorker ];
+            if (storage.m_status == WorkerThreadStatus::kSleeping)
             {
-                m_threadSupport->runTask( iWorker, &m_jobContext );
+                m_threadSupport->runTask( iWorker, &storage );
                 numActiveWorkers++;
             }
         }
@@ -412,13 +619,12 @@ public:
     void waitForWorkersToSleep()
     {
         BT_PROFILE( "waitForWorkersToSleep" );
-        m_jobContext.m_workersShouldSleep = true;
+        setWorkerDirectives( WorkerThreadDirectives::kGoToSleep );  // WorkerThreadFunc leaves its loop when it sees this directive
         m_threadSupport->waitForAllTasks();
-        for ( int i = 0; i < m_numWorkerThreads; i++ )
+        for ( int i = kFirstWorkerThreadId; i < m_numThreads; i++ )  // debug check only: every worker should now report kSleeping
         {
-            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory(i) );
-            btAssert( storage );
-            btAssert( storage->status == WorkerThreadStatus::kSleeping );
+            ThreadLocalStorage& storage = m_threadLocalStorage[i];
+            btAssert( storage.m_status == WorkerThreadStatus::kSleeping );
         }
     }
 
@@ -426,20 +632,19 @@ public:
     {
         BT_PROFILE( "sleepWorkerThreadsHint" );
         // hint the task scheduler that we may not be using these threads for a little while
-        m_jobContext.m_workersShouldSleep = true;
+        setWorkerDirectives( WorkerThreadDirectives::kGoToSleep );
     }
 
     void prepareWorkerThreads()
     {
-        for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
+        for ( int i = kFirstWorkerThreadId; i < m_numThreads; ++i )  // worker threads only; thread 0 is the main thread
         {
-            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
-            storage->m_mutex.lock();
-            storage->numJobsFinished = 0;
-            storage->m_mutex.unlock();
+            ThreadLocalStorage& storage = m_threadLocalStorage[i];
+            storage.m_mutex.lock();
+            storage.m_numJobsFinished = 0;  // waitJobs() sums these counters to detect that all jobs are done
+            storage.m_mutex.unlock();
         }
-        m_jobContext.m_workersShouldSleep = false;
-        setWorkersActive( true );
+        setWorkerDirectives( WorkerThreadDirectives::kScanForJobs );  // workers that are awake keep polling their queue instead of timing out
     }
 
     virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
@@ -455,32 +660,32 @@ public:
             m_numJobs = jobCount;
             btAssert( jobCount >= 2 );  // need more than one job for multithreading
             int jobSize = sizeof( JobType );
-            int jobBufSize = jobSize * jobCount;
-            // make sure we have enough memory allocated to store jobs
-            if ( jobBufSize > m_jobMem.size() )
-            {
-                m_jobMem.resize( jobBufSize );
-            }
-            // make sure job queue is big enough
-            if ( jobCount > m_jobContext.m_jobQueue.capacity() )
-            {
-                m_jobContext.m_jobQueue.reserve( jobCount );
-            }
 
-            m_jobContext.clearQueue();
+            for (int i = 0; i < m_numActiveJobQueues; ++i)
+            {
+                m_jobQueues[i].clearQueue( jobCount, jobSize );
+            }
             // prepare worker threads for incoming work
             prepareWorkerThreads();
             // submit all of the jobs
             int iJob = 0;
-            JobType* jobs = reinterpret_cast<JobType*>( &m_jobMem[ 0 ] );
+            int iThread = kFirstWorkerThreadId;  // first worker thread
             for ( int i = iBegin; i < iEnd; i += grainSize )
             {
                 btAssert( iJob < jobCount );
                 int iE = btMin( i + grainSize, iEnd );
-                JobType& job = jobs[ iJob ];
-                new ( (void*) &job ) ParallelForJob( i, iE, body );  // placement new
-                m_jobContext.submitJob( &job );
+                JobQueue* jq = m_perThreadJobQueues[ iThread ];
+                btAssert(jq);
+                btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
+                void* jobMem = jq->allocJobMem(jobSize);
+                JobType* job = new ( jobMem ) ParallelForJob( i, iE, body );  // placement new
+                jq->submitJob( job );
                 iJob++;
+                iThread++;
+                if ( iThread >= m_numThreads )
+                {
+                    iThread = kFirstWorkerThreadId;  // first worker thread
+                }
             }
             wakeWorkers( jobCount - 1 );
 
@@ -508,44 +713,38 @@ public:
             m_numJobs = jobCount;
             btAssert( jobCount >= 2 );  // need more than one job for multithreading
             int jobSize = sizeof( JobType );
-            int jobBufSize = jobSize * jobCount;
-            // make sure we have enough memory allocated to store jobs
-            if ( jobBufSize > m_jobMem.size() )
+            for (int i = 0; i < m_numActiveJobQueues; ++i)
             {
-                m_jobMem.resize( jobBufSize );
-            }
-            // make sure job queue is big enough
-            if ( jobCount > m_jobContext.m_jobQueue.capacity() )
-            {
-                m_jobContext.m_jobQueue.reserve( jobCount );
-            }
-            // make sure thread local area is big enough
-            int threadLocalSize = m_numThreads * sizeof( ThreadLocalSum );
-            if ( threadLocalSize > m_threadLocalMem.size() )
-            {
-                m_threadLocalMem.resize( threadLocalSize );
-            }
-            // initialize summation
-            ThreadLocalSum* threadLocalSum = reinterpret_cast<ThreadLocalSum*>( &m_threadLocalMem[ 0 ] );
-            for ( int iThread = 0; iThread < m_numThreads; ++iThread )
-            {
-                threadLocalSum[ iThread ].mSum = btScalar( 0 );
+                m_jobQueues[i].clearQueue( jobCount, jobSize );
+            }
+
+            // initialize summation
+            for ( int iThread = 0; iThread < m_numThreads; ++iThread )
+            {
+                m_threadLocalStorage[iThread].m_sumResult = btScalar(0);
             }
 
-            m_jobContext.clearQueue();
             // prepare worker threads for incoming work
             prepareWorkerThreads();
             // submit all of the jobs
             int iJob = 0;
-            JobType* jobs = reinterpret_cast<JobType*>( &m_jobMem[ 0 ] );
+            int iThread = kFirstWorkerThreadId;  // first worker thread
             for ( int i = iBegin; i < iEnd; i += grainSize )
             {
                 btAssert( iJob < jobCount );
                 int iE = btMin( i + grainSize, iEnd );
-                JobType& job = jobs[ iJob ];
-                new ( (void*) &job ) ParallelSumJob( i, iE, body, threadLocalSum );  // placement new
-                m_jobContext.submitJob( &job );
+                JobQueue* jq = m_perThreadJobQueues[ iThread ];
+                btAssert(jq);
+                btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
+                void* jobMem = jq->allocJobMem(jobSize);
+                JobType* job = new ( jobMem ) ParallelSumJob( i, iE, body, &m_threadLocalStorage[0] );  // placement new
+                jq->submitJob( job );
                 iJob++;
+                iThread++;
+                if ( iThread >= m_numThreads )
+                {
+                    iThread = kFirstWorkerThreadId;  // first worker thread
+                }
             }
             wakeWorkers( jobCount - 1 );
 
@@ -556,7 +755,7 @@ public:
             btScalar sum = btScalar(0);
             for ( int iThread = 0; iThread < m_numThreads; ++iThread )
             {
-                sum += threadLocalSum[ iThread ].mSum;
+                sum += m_threadLocalStorage[ iThread ].m_sumResult;
             }
             m_antiNestingLock.unlock();
             return sum;
@@ -586,4 +785,4 @@ btITaskScheduler* btCreateDefaultTaskScheduler()
     return NULL;
 }
 
-#endif // #else // #if BT_THREADSAFE
\ No newline at end of file
+#endif // #else // #if BT_THREADSAFE
diff --git a/src/LinearMath/TaskScheduler/btThreadSupportInterface.h b/src/LinearMath/TaskScheduler/btThreadSupportInterface.h
index d537d7095..a0ad802b1 100644
--- a/src/LinearMath/TaskScheduler/btThreadSupportInterface.h
+++ b/src/LinearMath/TaskScheduler/btThreadSupportInterface.h
@@ -37,34 +37,29 @@ public:
 
     virtual int getNumWorkerThreads() const = 0;  // number of worker threads (total number of logical processors - 1)
     virtual int getCacheFriendlyNumThreads() const = 0;  // the number of logical processors sharing a single L3 cache
+    virtual int getLogicalToPhysicalCoreRatio() const = 0;  // the number of logical processors per physical processor (usually 1 or 2)
     virtual void runTask( int threadIndex, void* userData ) = 0;
     virtual void waitForAllTasks() = 0;
 
     virtual btCriticalSection* createCriticalSection() = 0;
     virtual void deleteCriticalSection( btCriticalSection* criticalSection ) = 0;
 
-    virtual void* getThreadLocalMemory( int taskId ) { return NULL; }
-
-    typedef void( *ThreadFunc )( void* userPtr, void* lsMemory );
-    typedef void* ( *MemorySetupFunc )( );
+    typedef void( *ThreadFunc )( void* userPtr );
 
     struct ConstructionInfo
     {
         ConstructionInfo( const char* uniqueName,
             ThreadFunc userThreadFunc,
-            MemorySetupFunc	lsMemoryFunc,
             int threadStackSize = 65535
         )
             :m_uniqueName( uniqueName ),
             m_userThreadFunc( userThreadFunc ),
-            m_lsMemoryFunc( lsMemoryFunc ),
             m_threadStackSize( threadStackSize )
         {
         }
 
         const char*     m_uniqueName;
         ThreadFunc      m_userThreadFunc;
-        MemorySetupFunc m_lsMemoryFunc;
         int             m_threadStackSize;
     };
 
diff --git a/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp b/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
index 5521fc555..ccd7d1e12 100644
--- a/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
+++ b/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
@@ -73,7 +73,6 @@ public:
 
         ThreadFunc m_userThreadFunc;
         void* m_userPtr; //for taskDesc etc
-        void* m_lsMemory; //initialized using PosixLocalStoreMemorySetupFunc
 
         pthread_t thread;
         //each tread will wait until this signal to start its work
@@ -103,17 +102,14 @@ public:
     virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
     // TODO: return the number of logical processors sharing the first L3 cache
     virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return m_numThreads + 1; }
+    // TODO: detect if CPU has hyperthreading enabled
+    virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return 1; }
 
     virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE;
     virtual void waitForAllTasks() BT_OVERRIDE;
 
     virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
     virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE;
-
-    virtual void* getThreadLocalMemory( int taskId ) BT_OVERRIDE
-    {
-        return m_activeThreadStatus[ taskId ].m_lsMemory;
-    }
 };
 
 
@@ -190,7 +186,7 @@ static void *threadFunction( void *argument )
         if ( userPtr )
         {
             btAssert( status->m_status );
-            status->m_userThreadFunc( userPtr, status->m_lsMemory );
+            status->m_userThreadFunc( userPtr );
             status->m_status = 2;
             checkPThreadFunction( sem_post( status->m_mainSemaphore ) );
             status->threadUsed++;
@@ -292,7 +288,6 @@ void btThreadSupportPosix::startThreads( const ConstructionInfo& threadConstruct
         threadStatus.m_commandId = 0;
         threadStatus.m_status = 0;
         threadStatus.m_mainSemaphore = m_mainSemaphore;
-        threadStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
         threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
         threadStatus.threadUsed = 0;
 
diff --git a/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
index de693590e..00edac650 100644
--- a/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
+++ b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
@@ -179,7 +179,6 @@ public:
 
         ThreadFunc m_userThreadFunc;
         void* m_userPtr; //for taskDesc etc
-        void* m_lsMemory; //initialized using Win32LocalStoreMemorySetupFunc
 
         void* m_threadHandle; //this one is calling 'Win32ThreadFunc'
 
@@ -208,15 +207,11 @@ public:
 
     virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
     virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); }
+    virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return ( m_processorInfo.numCores > 0 ) ? ( m_processorInfo.numLogicalProcessors / m_processorInfo.numCores ) : 1; }  // report 1 instead of dividing by zero when no cores were detected
 
     virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE;
     virtual void waitForAllTasks() BT_OVERRIDE;
 
-    virtual void* getThreadLocalMemory( int taskId ) BT_OVERRIDE
-    {
-        return m_activeThreadStatus[ taskId ].m_lsMemory;
-    }
-
     virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
     virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE;
 };
@@ -246,7 +241,7 @@ DWORD WINAPI win32threadStartFunc( LPVOID lpParam )
         if ( userPtr )
         {
             btAssert( status->m_status );
-            status->m_userThreadFunc( userPtr, status->m_lsMemory );
+            status->m_userThreadFunc( userPtr );
             status->m_status = 2;
             SetEvent( status->m_eventCompleteHandle );
         }
@@ -392,7 +387,6 @@ void btThreadSupportWin32::startThreads( const ConstructionInfo& threadConstruct
         threadStatus.m_commandId = 0;
         threadStatus.m_status = 0;
         threadStatus.m_threadHandle = handle;
-        threadStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
         threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
 
         printf( "started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle );
@@ -410,9 +404,7 @@ void btThreadSupportWin32::stopThreads()
             WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE );
         }
 
-        delete threadStatus.m_lsMemory;
-
-        threadStatus.m_userPtr = 0;
+        threadStatus.m_userPtr = NULL;
         SetEvent( threadStatus.m_eventStartHandle );
         WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE );