parallel solver: various changes

- threading: adding btSequentialImpulseConstraintSolverMt
- task scheduler: added parallelSum so that parallel solver can compute residuals
- CommonRigidBodyMTBase: add slider for solver least squares residual and allow multithreading without needing OpenMP, TBB, or PPL
- task scheduler: don't wait for workers to sleep/signal at the end of each parallel block
- parallel solver: convertContacts split into an allocContactConstraints and setupContactConstraints stage, the latter of which is done in parallel
- parallel solver: rolling friction is now interleaved along with normal friction
- parallel solver: batchified split impulse solving + some cleanup
- parallel solver: sorting batches from largest to smallest
- parallel solver: added parallel batch creation
- parallel solver: added warmstartingWriteBackContacts func + other cleanup
- task scheduler: truncate low bits to preserve determinism with parallelSum
- parallel solver: reducing dynamic mem allocs and trying to parallelize more of the batch setup
- parallel solver: parallelize updating constraint batch ids for merging
- parallel solver: adding debug visualization
- task scheduler: make TBB task scheduler parallelSum deterministic
- parallel solver: split batch gen code into separate file; allow selection of batch gen method
- task scheduler: add sleepWorkerThreadsHint() at end of simulation
- parallel solver: added grain size per phase
- task scheduler: fix for strange threading issue; also no need for main thread to wait for workers to sleep
- base constraint solver: break out joint setup into separate function for profiling/overriding
- parallel solver: allow different batching method for contacts vs joints
- base constraint solver: add convertJoint and convertBodies to make it possible to parallelize joint and body conversion
- parallel solver: convert joints and bodies in parallel now
- parallel solver: speed up batch creation with run-length encoding
- parallel solver: batch gen: run-length expansion in parallel; collect constraint info in parallel
- parallel solver: adding spatial grid batching method
- parallel solver: enhancements to spatial grid batching
- sequential solver: moving code for writing back into functions that derived classes can call
- parallel solver: do write back of bodies and joints in parallel
- parallel solver: removed all batching methods except for spatial grid (others were ineffective)
- parallel solver: added 2D or 3D grid batching options; and a bit of cleanup
- move btDefaultTaskScheduler into LinearMath project
2017-06-04 17:57:25 -07:00
parent 94bc897067
commit b8720f2161
25 changed files with 5236 additions and 767 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,14 +28,14 @@ OPTION(USE_GRAPHICAL_BENCHMARK "Use Graphical Benchmark" ON)
 OPTION(BUILD_SHARED_LIBS "Use shared libraries" OFF)
 OPTION(USE_SOFT_BODY_MULTI_BODY_DYNAMICS_WORLD "Use btSoftMultiBodyDynamicsWorld" ON)
-OPTION(BULLET2_USE_THREAD_LOCKS "Build Bullet 2 libraries with mutex locking around certain operations (required for multi-threading)" OFF)
+OPTION(BULLET2_MULTITHREADING "Build Bullet 2 libraries with mutex locking around certain operations (required for multi-threading)" OFF)
-IF (BULLET2_USE_THREAD_LOCKS)
+IF (BULLET2_MULTITHREADING)
    OPTION(BULLET2_USE_OPEN_MP_MULTITHREADING "Build Bullet 2 with support for multi-threading with OpenMP (requires a compiler with OpenMP support)" OFF)
    OPTION(BULLET2_USE_TBB_MULTITHREADING "Build Bullet 2 with support for multi-threading with Intel Threading Building Blocks (requires the TBB library to be already installed)" OFF)
    IF (MSVC)
        OPTION(BULLET2_USE_PPL_MULTITHREADING "Build Bullet 2 with support for multi-threading with Microsoft Parallel Patterns Library (requires MSVC compiler)" OFF)
    ENDIF (MSVC)
-ENDIF (BULLET2_USE_THREAD_LOCKS)
+ENDIF (BULLET2_MULTITHREADING)
 IF(NOT WIN32)
@@ -225,12 +225,15 @@ IF(USE_GRAPHICAL_BENCHMARK)
 ADD_DEFINITIONS( -DUSE_GRAPHICAL_BENCHMARK)
 ENDIF (USE_GRAPHICAL_BENCHMARK)
-IF(BULLET2_USE_THREAD_LOCKS)
+IF(BULLET2_MULTITHREADING)
 	ADD_DEFINITIONS( -DBT_THREADSAFE=1 )
 	IF (NOT MSVC)
 		SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 	ENDIF (NOT MSVC)
-ENDIF (BULLET2_USE_THREAD_LOCKS)
+	IF (NOT WIN32)
 		LINK_LIBRARIES( pthread )
 	ENDIF (NOT WIN32)
 ENDIF (BULLET2_MULTITHREADING)
 IF (BULLET2_USE_OPEN_MP_MULTITHREADING)
    ADD_DEFINITIONS("-DBT_USE_OPENMP=1")
--- a/examples/ExampleBrowser/CMakeLists.txt
+++ b/examples/ExampleBrowser/CMakeLists.txt
@@ -226,7 +226,6 @@ SET(BulletExampleBrowser_SRCS
 	../MultiThreading/b3PosixThreadSupport.cpp
 	../MultiThreading/b3Win32ThreadSupport.cpp
 	../MultiThreading/b3ThreadSupportInterface.cpp
 	../MultiThreading/btTaskScheduler.cpp
 	../RenderingExamples/TinyRendererSetup.cpp
 	../RenderingExamples/TimeSeriesCanvas.cpp
 	../RenderingExamples/TimeSeriesCanvas.h
--- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
+++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
@@ -29,17 +29,17 @@ class btCollisionShape;
 #include "BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h"
 #include "BulletDynamics/Dynamics/btSimulationIslandManagerMt.h"  // for setSplitIslands()
 #include "BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h"
 #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h"
 #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"
 #include "BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.h"
 #include "BulletDynamics/MLCPSolvers/btMLCPSolver.h"
 #include "BulletDynamics/MLCPSolvers/btSolveProjectedGaussSeidel.h"
 #include "BulletDynamics/MLCPSolvers/btDantzigSolver.h"
 #include "BulletDynamics/MLCPSolvers/btLemkeSolver.h"
 #include "../MultiThreading/btTaskScheduler.h"
 static int gNumIslands = 0;
-
+bool gAllowNestedParallelForLoops = false;
 class Profiler
 {
@@ -52,6 +52,10 @@ public:
        kRecordPredictUnconstrainedMotion,
        kRecordCreatePredictiveContacts,
        kRecordIntegrateTransforms,
        kRecordSolverTotal,
        kRecordSolverSetup,
        kRecordSolverIterations,
        kRecordSolverFinish,
        kRecordCount
    };
@@ -139,6 +143,41 @@ static void profileEndCallback( btDynamicsWorld *world, btScalar timeStep )
 }
 ///
 /// MySequentialImpulseConstraintSolverMt -- subclassed for profiling purposes
 ///
 /// Each override keeps a ProfileHelper alive for the duration of the call and
 /// otherwise forwards its arguments unchanged to the parent implementation.
 ///
 class MySequentialImpulseConstraintSolverMt : public btSequentialImpulseConstraintSolverMt
 {
    typedef btSequentialImpulseConstraintSolverMt ParentClass;
 public:
    BT_DECLARE_ALIGNED_ALLOCATOR();

    MySequentialImpulseConstraintSolverMt() {}

    // recorded under Profiler::kRecordSolverSetup
    virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE
    {
        ProfileHelper prof(Profiler::kRecordSolverSetup);
        return ParentClass::solveGroupCacheFriendlySetup(bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer );
    }

    // recorded under Profiler::kRecordSolverIterations
    virtual btScalar solveGroupCacheFriendlyIterations( btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer ) BT_OVERRIDE
    {
        ProfileHelper prof(Profiler::kRecordSolverIterations);
        return ParentClass::solveGroupCacheFriendlyIterations(bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer );
    }

    // recorded under Profiler::kRecordSolverFinish
    virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal) BT_OVERRIDE
    {
        ProfileHelper prof(Profiler::kRecordSolverFinish);
        return ParentClass::solveGroupCacheFriendlyFinish(bodies, numBodies, infoGlobal);
    }

    // recorded under Profiler::kRecordSolverTotal
    virtual btScalar solveGroup(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifold, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btDispatcher* dispatcher) BT_OVERRIDE
    {
        ProfileHelper prof(Profiler::kRecordSolverTotal);
        return ParentClass::solveGroup(bodies, numBodies, manifold, numManifolds, constraints, numConstraints, info, debugDrawer, dispatcher);
    }
 };
 ///
 /// MyCollisionDispatcher -- subclassed for profiling purposes
 ///
@@ -218,6 +257,8 @@ btConstraintSolver* createSolverByType( SolverType t )
    {
    case SOLVER_TYPE_SEQUENTIAL_IMPULSE:
        return new btSequentialImpulseConstraintSolver();
    case SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT:
        return new MySequentialImpulseConstraintSolverMt();
    case SOLVER_TYPE_NNCG:
        return new btNNCGConstraintSolver();
    case SOLVER_TYPE_MLCP_PGS:
@@ -253,7 +294,7 @@ public:
    {
        addTaskScheduler( btGetSequentialTaskScheduler() );
 #if BT_THREADSAFE
-        if ( btITaskScheduler* ts = createDefaultTaskScheduler() )
+        if ( btITaskScheduler* ts = btCreateDefaultTaskScheduler() )
        {
            m_allocatedTaskSchedulers.push_back( ts );
            addTaskScheduler( ts );
@@ -310,7 +351,7 @@ static bool gDisplayProfileInfo = true;
 static bool gMultithreadedWorld = false;
 static bool gDisplayProfileInfo = false;
 #endif
-static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
+static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT;
 static int gSolverMode = SOLVER_SIMD |
                        SOLVER_USE_WARMSTARTING |
                        // SOLVER_RANDMIZE_ORDER |
@@ -318,9 +359,11 @@ static int gSolverMode = SOLVER_SIMD |
                        // SOLVER_USE_2_FRICTION_DIRECTIONS |
                        0;
 static btScalar gSliderSolverIterations = 10.0f; // should be int
 static btScalar gSliderNumThreads = 1.0f;  // should be int
-
+static btScalar gSliderIslandBatchingThreshold = 0.0f; // should be int
 static btScalar gSliderMinBatchSize = btScalar(btSequentialImpulseConstraintSolverMt::s_minBatchSize); // should be int
 static btScalar gSliderMaxBatchSize = btScalar(btSequentialImpulseConstraintSolverMt::s_maxBatchSize); // should be int
 static btScalar gSliderLeastSquaresResidualThreshold = 0.0f;
 ////////////////////////////////////
 CommonRigidBodyMTBase::CommonRigidBodyMTBase( struct GUIHelperInterface* helper )
@@ -419,6 +462,23 @@ void setTaskSchedulerComboBoxCallback(int combobox, const char* item, void* user
 }
 // Combo box callback: selects the contact batching method whose label matches `item`.
 // `userPointer` is the array of labels handed to the combo box; the index of a label
 // in that array is the numeric value of the corresponding BatchingMethod.
 // Does nothing when no label matches or when built without BT_THREADSAFE.
 void setBatchingMethodComboBoxCallback(int combobox, const char* item, void* userPointer)
 {
 #if BT_THREADSAFE
    const char** methodLabels = static_cast<const char**>( userPointer );
    const int methodCount = btBatchedConstraints::BATCHING_METHOD_COUNT;
    int methodIndex = 0;
    // advance to the first label equal to the selected item
    while ( methodIndex < methodCount && strcmp( item, methodLabels[ methodIndex ] ) != 0 )
    {
        ++methodIndex;
    }
    if ( methodIndex < methodCount )
    {
        // change the contact batching method
        btSequentialImpulseConstraintSolverMt::s_contactBatchingMethod = static_cast<btBatchedConstraints::BatchingMethod>( methodIndex );
    }
 #endif // #if BT_THREADSAFE
 }
 static void setThreadCountCallback(float val, void* userPtr)
 {
 #if BT_THREADSAFE
@@ -435,13 +495,43 @@ static void setSolverIterationCountCallback(float val, void* userPtr)
    }
 }
 // Slider callback: copies the island batching threshold slider value (truncated to int)
 // into the solver's s_minimumContactManifoldsForBatching. `val` and `userPtr` are unused;
 // the value is read from the slider's backing global.
 static void setLargeIslandManifoldCountCallback( float val, void* userPtr )
 {
    const int manifoldThreshold = int( gSliderIslandBatchingThreshold );
    btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching = manifoldThreshold;
 }
 // Slider callback for the minimum batch size: raises the max slider so that it is
 // never below the min slider, then publishes both values (truncated to int) to the solver.
 static void setMinBatchSizeCallback( float val, void* userPtr )
 {
    // keep max >= min
    if ( !( gSliderMinBatchSize < gSliderMaxBatchSize ) )
    {
        gSliderMaxBatchSize = gSliderMinBatchSize;
    }
    btSequentialImpulseConstraintSolverMt::s_minBatchSize = int( gSliderMinBatchSize );
    btSequentialImpulseConstraintSolverMt::s_maxBatchSize = int( gSliderMaxBatchSize );
 }
 // Slider callback for the maximum batch size: lowers the min slider so that it is
 // never above the max slider, then publishes both values (truncated to int) to the solver.
 static void setMaxBatchSizeCallback( float val, void* userPtr )
 {
    // keep min <= max
    if ( gSliderMaxBatchSize < gSliderMinBatchSize )
    {
        gSliderMinBatchSize = gSliderMaxBatchSize;
    }
    btSequentialImpulseConstraintSolverMt::s_minBatchSize = int( gSliderMinBatchSize );
    btSequentialImpulseConstraintSolverMt::s_maxBatchSize = int( gSliderMaxBatchSize );
 }
 static void setLeastSquaresResidualThresholdCallback( float val, void* userPtr )
 {
    if (btDiscreteDynamicsWorld* world = reinterpret_cast<btDiscreteDynamicsWorld*>(userPtr))
    {
        world->getSolverInfo().m_leastSquaresResidualThreshold = gSliderLeastSquaresResidualThreshold;
    }
 }
 void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
 {
    gNumIslands = 0;
    m_solverType = gSolverType;
-#if BT_THREADSAFE && (BT_USE_OPENMP || BT_USE_PPL || BT_USE_TBB)
+#if BT_THREADSAFE
    btAssert( btGetTaskScheduler() != NULL );
-    m_multithreadCapable = true;
+    if (NULL != btGetTaskScheduler() && gTaskSchedulerMgr.getNumTaskSchedulers() > 1)
    {
        m_multithreadCapable = true;
    }
 #endif
    if ( gMultithreadedWorld )
    {
@@ -486,7 +576,12 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
        m_broadphase = new btDbvtBroadphase();
-        m_solver = createSolverByType( m_solverType );
+        SolverType solverType = m_solverType;
        if ( solverType == SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT )
        {
            solverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
        }
        m_solver = createSolverByType( solverType );
        m_dynamicsWorld = new btDiscreteDynamicsWorld( m_dispatcher, m_broadphase, m_solver, m_collisionConfiguration );
    }
@@ -494,6 +589,7 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
    m_dynamicsWorld->setInternalTickCallback( profileEndCallback, NULL, false );
    m_dynamicsWorld->setGravity( btVector3( 0, -10, 0 ) );
    m_dynamicsWorld->getSolverInfo().m_solverMode = gSolverMode;
   	m_dynamicsWorld->getSolverInfo().m_numIterations = btMax(1, int(gSliderSolverIterations));
    createDefaultParameters();
 }
@@ -504,16 +600,18 @@ void CommonRigidBodyMTBase::createDefaultParameters()
    {
        // create a button to toggle multithreaded world
        ButtonParams button( "Multithreaded world enable", 0, true );
-        button.m_initialState = gMultithreadedWorld;
+        bool* ptr = &gMultithreadedWorld;
-        button.m_userPointer = &gMultithreadedWorld;
+        button.m_initialState = *ptr;
        button.m_userPointer = ptr;
        button.m_callback = boolPtrButtonCallback;
        m_guiHelper->getParameterInterface()->registerButtonParameter( button );
    }
    {
        // create a button to toggle profile printing
        ButtonParams button( "Display solver info", 0, true );
-        button.m_initialState = gDisplayProfileInfo;
+        bool* ptr = &gDisplayProfileInfo;
-        button.m_userPointer = &gDisplayProfileInfo;
+        button.m_initialState = *ptr;
        button.m_userPointer = ptr;
        button.m_callback = boolPtrButtonCallback;
        m_guiHelper->getParameterInterface()->registerButtonParameter( button );
    }
@@ -544,6 +642,16 @@ void CommonRigidBodyMTBase::createDefaultParameters()
        slider.m_clampToIntegers = true;
        m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
    }
    {
        // a slider for the solver leastSquaresResidualThreshold (used to run fewer solver iterations when convergence is good)
        SliderParams slider( "Solver residual thresh", &gSliderLeastSquaresResidualThreshold );
        slider.m_minVal = 0.0f;
        slider.m_maxVal = 0.25f;
        slider.m_callback = setLeastSquaresResidualThresholdCallback;
        slider.m_userPointer = m_dynamicsWorld;
        slider.m_clampToIntegers = false;
        m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
    }
    {
        ButtonParams button( "Solver use SIMD", 0, true );
        button.m_buttonId = SOLVER_SIMD;
@@ -618,20 +726,86 @@ void CommonRigidBodyMTBase::createDefaultParameters()
            m_guiHelper->getParameterInterface()->registerComboBox( comboParams );
        }
        {
            // create a slider to set the number of threads to use
            int numThreads = btGetTaskScheduler()->getNumThreads();
            // if slider has not been set yet (by another demo),
            if ( gSliderNumThreads <= 1.0f )
            {
                // create a slider to set the number of threads to use
                int numThreads = btGetTaskScheduler()->getNumThreads();
                gSliderNumThreads = float( numThreads );
            }
            int maxNumThreads = btGetTaskScheduler()->getMaxNumThreads();
 			SliderParams slider("Thread count", &gSliderNumThreads);
 			slider.m_minVal = 1.0f;
-			slider.m_maxVal = float( BT_MAX_THREAD_COUNT );
+			slider.m_maxVal = float( maxNumThreads );
 			slider.m_callback = setThreadCountCallback;
            slider.m_clampToIntegers = true;
            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
        }
        {
            // a slider for the number of manifolds an island needs to be too large for parallel dispatch
            if (gSliderIslandBatchingThreshold < 1.0)
            {
                gSliderIslandBatchingThreshold = float(btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching);
            }
            SliderParams slider( "IslandBatchThresh", &gSliderIslandBatchingThreshold );
            slider.m_minVal = 1.0f;
            slider.m_maxVal = 2000.0f;
            slider.m_callback = setLargeIslandManifoldCountCallback;
            slider.m_userPointer = NULL;
            slider.m_clampToIntegers = true;
            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
        }
        {
            // create a combo box for selecting the batching method
            static const char* sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_COUNT ];
            {
                sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_2D ] = "Batching: 2D Grid";
                sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_3D ] = "Batching: 3D Grid";
            };
            ComboBoxParams comboParams;
            comboParams.m_userPointer = sBatchingMethodComboBoxItems;
            comboParams.m_numItems = btBatchedConstraints::BATCHING_METHOD_COUNT;
            comboParams.m_startItem = static_cast<int>(btSequentialImpulseConstraintSolverMt::s_contactBatchingMethod);
            comboParams.m_items = sBatchingMethodComboBoxItems;
            comboParams.m_callback = setBatchingMethodComboBoxCallback;
            m_guiHelper->getParameterInterface()->registerComboBox( comboParams );
        }
        {
            // a slider for the sequentialImpulseConstraintSolverMt min batch size (when batching)
            SliderParams slider( "Min batch size", &gSliderMinBatchSize );
            slider.m_minVal = 1.0f;
            slider.m_maxVal = 1000.0f;
            slider.m_callback = setMinBatchSizeCallback;
            slider.m_userPointer = NULL;
            slider.m_clampToIntegers = true;
            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
        }
        {
            // a slider for the sequentialImpulseConstraintSolverMt max batch size (when batching)
            SliderParams slider( "Max batch size", &gSliderMaxBatchSize );
            slider.m_minVal = 1.0f;
            slider.m_maxVal = 1000.0f;
            slider.m_callback = setMaxBatchSizeCallback;
            slider.m_userPointer = NULL;
            slider.m_clampToIntegers = true;
            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
        }
        {
            // create a button to toggle debug drawing of batching visualization
            ButtonParams button( "Visualize batching", 0, true );
            bool* ptr = &btBatchedConstraints::s_debugDrawBatches;
            button.m_initialState = *ptr;
            button.m_userPointer = ptr;
            button.m_callback = boolPtrButtonCallback;
            m_guiHelper->getParameterInterface()->registerButtonParameter( button );
        }
        {
            ButtonParams button( "Allow Nested ParallelFor", 0, true );
            button.m_initialState = btSequentialImpulseConstraintSolverMt::s_allowNestedParallelForLoops;
            button.m_userPointer = &btSequentialImpulseConstraintSolverMt::s_allowNestedParallelForLoops;
            button.m_callback = boolPtrButtonCallback;
            m_guiHelper->getParameterInterface()->registerButtonParameter( button );
        }
 #endif // #if BT_THREADSAFE
    }
 }
@@ -643,6 +817,7 @@ void CommonRigidBodyMTBase::drawScreenText()
    int xCoord = 400;
    int yCoord = 30;
    int yStep = 30;
    int indent = 30;
    if (m_solverType != gSolverType)
    {
        sprintf( msg, "restart example to change solver type" );
@@ -721,6 +896,34 @@ void CommonRigidBodyMTBase::drawScreenText()
            m_guiHelper->getAppInterface()->drawText( msg, xCoord, yCoord, 0.4f );
            yCoord += yStep;
            sprintf( msg,
                     "SolverTotal %5.3f ms",
                     gProfiler.getAverageTime( Profiler::kRecordSolverTotal )*0.001f
                     );
            m_guiHelper->getAppInterface()->drawText( msg, xCoord, yCoord, 0.4f );
            yCoord += yStep;
            sprintf( msg,
                     "SolverSetup %5.3f ms",
                     gProfiler.getAverageTime( Profiler::kRecordSolverSetup )*0.001f
                     );
            m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f );
            yCoord += yStep;
            sprintf( msg,
                     "SolverIterations %5.3f ms",
                     gProfiler.getAverageTime( Profiler::kRecordSolverIterations )*0.001f
                     );
            m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f );
            yCoord += yStep;
            sprintf( msg,
                     "SolverFinish %5.3f ms",
                     gProfiler.getAverageTime( Profiler::kRecordSolverFinish )*0.001f
                     );
            m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f );
            yCoord += yStep;
            sprintf( msg,
                     "PredictUnconstrainedMotion %5.3f ms",
                     gProfiler.getAverageTime( Profiler::kRecordPredictUnconstrainedMotion )*0.001f
--- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h
+++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h
@@ -14,6 +14,7 @@
 enum SolverType
 {
    SOLVER_TYPE_SEQUENTIAL_IMPULSE,
    SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT,
    SOLVER_TYPE_NNCG,
    SOLVER_TYPE_MLCP_PGS,
    SOLVER_TYPE_MLCP_DANTZIG,
@@ -27,6 +28,7 @@ inline const char* getSolverTypeName( SolverType t )
    switch (t)
    {
    case SOLVER_TYPE_SEQUENTIAL_IMPULSE: return "SequentialImpulse";
    case SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT: return "SequentialImpulseMt";
    case SOLVER_TYPE_NNCG: return "NNCG";
    case SOLVER_TYPE_MLCP_PGS: return "MLCP ProjectedGaussSeidel";
    case SOLVER_TYPE_MLCP_DANTZIG: return "MLCP Dantzig";
--- a/examples/MultiThreadedDemo/MultiThreadedDemo.cpp
+++ b/examples/MultiThreadedDemo/MultiThreadedDemo.cpp
@@ -25,10 +25,10 @@ subject to the following restrictions:
-static btScalar gSliderStackRows = 8.0f;
+static btScalar gSliderStackRows = 1.0f;
-static btScalar gSliderStackColumns = 6.0f;
+static btScalar gSliderStackColumns = 1.0f;
-static btScalar gSliderStackHeight = 10.0f;
+static btScalar gSliderStackHeight = 15.0f;
-static btScalar gSliderStackWidth = 1.0f;
+static btScalar gSliderStackWidth = 8.0f;
 static btScalar gSliderGroundHorizontalAmplitude = 0.0f;
 static btScalar gSliderGroundVerticalAmplitude = 0.0f;
 static btScalar gSliderGroundTilt = 0.0f;
@@ -75,6 +75,21 @@ public:
        btScalar tilt = gSliderGroundTilt * SIMD_2_PI / 360.0f;
        return btQuaternion( btVector3( 1.0f, 0.0f, 0.0f ), tilt );
    }
    // Test body for btParallelSum: returns the sum of 1/i over the positive
    // integers i in [iBegin, iEnd); non-positive indices contribute nothing.
    struct TestSumBody : public btIParallelSumBody
    {
        virtual btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE
        {
            // skip i <= 0 up front instead of testing inside the loop
            const int first = ( iBegin > 1 ) ? iBegin : 1;
            btScalar total = 0.0f;
            for ( int k = first; k < iEnd; ++k )
            {
                total += 1.0f / btScalar( k );
            }
            return total;
        }
    };
    virtual void stepSimulation( float deltaTime ) BT_OVERRIDE
    {
        if ( m_dynamicsWorld )
@@ -115,6 +130,14 @@ public:
            // always step by 1/60 for benchmarking
            m_dynamicsWorld->stepSimulation( 1.0f / 60.0f, 0 );
        }
 #if 0
        {
            // test parallelSum
            TestSumBody testSumBody;
            float testSum = btParallelSum( 1, 10000000, 10000, testSumBody );
            printf( "sum = %f\n", testSum );
        }
 #endif
    }
    virtual void initPhysics() BT_OVERRIDE;
--- a/examples/MultiThreading/btTaskScheduler.cpp
+++ b/examples/MultiThreading/btTaskScheduler.cpp
@@ -1,448 +0,0 @@
 #include "LinearMath/btTransform.h"
 #include "../Utils/b3Clock.h"
 #include "LinearMath/btAlignedObjectArray.h"
 #include "LinearMath/btThreads.h"
 #include "LinearMath/btQuickprof.h"
 #include <stdio.h>
 #include <algorithm>
 typedef void( *btThreadFunc )( void* userPtr, void* lsMemory );
 typedef void* ( *btThreadLocalStorageFunc )();
 #if BT_THREADSAFE
 #if defined( _WIN32 )
 #include "b3Win32ThreadSupport.h"
 // Win32 flavour: builds a b3Win32ThreadSupport with `numThreads` workers that run
 // `threadFunc`, using `localStoreFunc` to allocate each worker's local storage.
 b3ThreadSupportInterface* createThreadSupport( int numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName )
 {
    b3Win32ThreadSupport::Win32ThreadConstructionInfo info( uniqueName, threadFunc, localStoreFunc, numThreads );
    // Priority 0 (highest, the default) can cause erratic performance when numThreads > numCores:
    // we don't want worker threads to be higher priority than the main thread or the main thread
    // could get totally shut out and unable to tell the workers to stop.
    info.m_priority = -1;  // normal priority
    return new b3Win32ThreadSupport( info );
 }
 #else // #if defined( _WIN32 )
 #include "b3PosixThreadSupport.h"
 // POSIX flavour: builds a b3PosixThreadSupport with `numThreads` workers that run
 // `threadFunc`, using `localStoreFunc` to allocate each worker's local storage.
 b3ThreadSupportInterface* createThreadSupport( int numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName)
 {
    b3PosixThreadSupport::ThreadConstructionInfo info( uniqueName, threadFunc, localStoreFunc, numThreads );
    return new b3PosixThreadSupport( info );
 }
 #endif // #else // #if defined( _WIN32 )
 ///
 /// getNumHardwareThreads()
 ///
 ///
 /// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
 ///
 // Returns the number of hardware threads, or 0 when it cannot be determined.
 // Three variants, selected at compile time: C++11 <thread>, Win32 GetSystemInfo,
 // and a fallback that always reports "unknown".
 #if __cplusplus >= 201103L
 #include <thread>
 int getNumHardwareThreads()
 {
    const unsigned int hwThreads = std::thread::hardware_concurrency();
    return static_cast<int>( hwThreads );
 }
 #elif defined( _WIN32 )
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 int getNumHardwareThreads()
 {
    // caps out at 32
    SYSTEM_INFO sysInfo;
    GetSystemInfo( &sysInfo );
    return static_cast<int>( sysInfo.dwNumberOfProcessors );
 }
 #else
 int getNumHardwareThreads()
 {
    // don't know
    return 0;
 }
 #endif
 // Scope wrapper for the status values a worker thread records in its
 // WorkerThreadLocalStorage.
 struct WorkerThreadStatus
 {
    enum Type
    {
        kInvalid,         // not assigned anywhere in the visible code
        kWaitingForWork,  // set by WorkerThreadFunc on entry and after each finished job
        kWorking,         // set by WorkerThreadFunc while a job's executeJob() runs
        kSleeping,        // set when WorkerThreadFunc leaves its loop, and as the initial state in init()
    };
 };
 // Abstract unit of work stored (by pointer) in JobContext::m_jobQueue.
 // NOTE(review): there is no virtual destructor; the visible code never deletes
 // through an IJob* -- confirm before doing so.
 struct IJob
 {
    // Runs the job on the calling thread (a worker, or the main thread in waitJobs()).
    virtual void executeJob() = 0;
 };
 class ParallelForJob : public IJob
 {
    const btIParallelForBody* mBody;
    int mBegin;
    int mEnd;
 public:
    ParallelForJob()
    {
        mBody = NULL;
        mBegin = 0;
        mEnd = 0;
    }
    void init( int iBegin, int iEnd, const btIParallelForBody& body )
    {
        mBody = &body;
        mBegin = iBegin;
        mEnd = iEnd;
    }
    virtual void executeJob() BT_OVERRIDE
    {
        BT_PROFILE( "executeJob" );
        // call the functor body to do the work
        mBody->forLoop( mBegin, mEnd );
    }
 };
 // Shared state between the scheduling thread and the worker threads: a queue of
 // job pointers consumed from m_headIndex up to m_tailIndex, plus the lock that
 // guards the indices and the "queue is empty" flag.
 struct JobContext
 {
    JobContext()
    {
        m_queueLock = NULL;
        m_headIndex = 0;
        m_tailIndex = 0;
        m_workersShouldCheckQueue = false;
        m_useSpinMutex = false;
        // consumeJob() reads this flag before taking any lock, so it must have a
        // defined value even if clearQueue() has not been called yet
        m_queueIsEmpty = true;
    }
    b3CriticalSection* m_queueLock;            // used by lockQueue() when m_useSpinMutex is false
    btSpinMutex m_mutex;                       // used by lockQueue() when m_useSpinMutex is true
    volatile bool m_workersShouldCheckQueue;   // workers keep polling the queue while this is true
    btAlignedObjectArray<IJob*> m_jobQueue;    // job pointers; the jobs themselves are not owned here
    bool m_queueIsEmpty;                       // true when m_headIndex has caught up with m_tailIndex
    int m_tailIndex;                           // number of jobs submitted since the last clearQueue()
    int m_headIndex;                           // index of the next job to hand out
    bool m_useSpinMutex;                       // selects which lock lockQueue()/unlockQueue() use

    // Acquires whichever lock is selected by m_useSpinMutex.
    void lockQueue()
    {
        if ( m_useSpinMutex )
        {
            m_mutex.lock();
        }
        else
        {
            m_queueLock->lock();
        }
    }
    // Releases the lock taken by lockQueue().
    void unlockQueue()
    {
        if ( m_useSpinMutex )
        {
            m_mutex.unlock();
        }
        else
        {
            m_queueLock->unlock();
        }
    }
    // Resets the queue to empty. The indices and flag are reset under the lock;
    // the array itself is shrunk afterwards, outside the lock.
    void clearQueue()
    {
        lockQueue();
        m_headIndex = 0;
        m_tailIndex = 0;
        m_queueIsEmpty = true;
        unlockQueue();
        m_jobQueue.resizeNoInitialize( 0 );
    }
    // Appends a job and publishes it to consumers by advancing m_tailIndex under the lock.
    // NOTE(review): push_back happens outside the lock -- presumably callers reserve
    // capacity up front so it never reallocates while workers read; confirm at call sites.
    void submitJob( IJob* job )
    {
        m_jobQueue.push_back( job );
        lockQueue();
        m_tailIndex++;
        m_queueIsEmpty = false;
        unlockQueue();
    }
    // Returns the next job, or NULL when the queue is (or appears to be) empty.
    IJob* consumeJob()
    {
        if ( m_queueIsEmpty )
        {
            // lock free path. even if this is taken erroneously it isn't harmful
            return NULL;
        }
        IJob* job = NULL;
        lockQueue();
        // re-check under the lock: another thread may have drained the queue meanwhile
        if ( !m_queueIsEmpty )
        {
            job = m_jobQueue[ m_headIndex++ ];
            if ( m_headIndex == m_tailIndex )
            {
                m_queueIsEmpty = true;
            }
        }
        unlockQueue();
        return job;
    }
 };
 // Per-worker record allocated by WorkerThreadAllocFunc and handed to
 // WorkerThreadFunc as its lsMemory argument.
 struct WorkerThreadLocalStorage
 {
    int threadId;                     // worker index, assigned in btTaskSchedulerDefault::init()
    WorkerThreadStatus::Type status;  // updated by WorkerThreadFunc as the worker changes state
 };
 static void WorkerThreadFunc( void* userPtr, void* lsMemory )
 {
    BT_PROFILE( "WorkerThreadFunc" );
    WorkerThreadLocalStorage* localStorage = (WorkerThreadLocalStorage*) lsMemory;
    localStorage->status = WorkerThreadStatus::kWaitingForWork;
    //printf( "WorkerThreadFunc: worker %d start working\n", localStorage->threadId );
    JobContext* jobContext = (JobContext*) userPtr;
    while ( jobContext->m_workersShouldCheckQueue )
    {
        if ( IJob* job = jobContext->consumeJob() )
        {
            localStorage->status = WorkerThreadStatus::kWorking;
            job->executeJob();
            localStorage->status = WorkerThreadStatus::kWaitingForWork;
        }
        else
        {
            // todo: spin wait a bit to avoid hammering the empty queue
        }
    }
    //printf( "WorkerThreadFunc stop working\n" );
    localStorage->status = WorkerThreadStatus::kSleeping;
    // go idle
 }
 // Allocates one WorkerThreadLocalStorage with `new`; passed to createThreadSupport()
 // as the local-storage allocator in btTaskSchedulerDefault::init().
 // NOTE(review): the fields are left uninitialized here (init() fills them in), and
 // who deletes the returned object is not visible in this file -- confirm.
 static void* WorkerThreadAllocFunc()
 {
    return new WorkerThreadLocalStorage;
 }
 class btTaskSchedulerDefault : public btITaskScheduler
 {
    JobContext m_jobContext;
    b3ThreadSupportInterface* m_threadSupport;
    btAlignedObjectArray<ParallelForJob> m_jobs;
    btSpinMutex m_antiNestingLock;  // prevent nested parallel-for
    int m_numThreads;
    int m_numWorkerThreads;
    int m_numWorkersRunning;
 public:
    btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
    {
        m_threadSupport = NULL;
        m_numThreads = getNumHardwareThreads();
        // if can't detect number of cores,
        if ( m_numThreads == 0 )
        {
            // take a guess
            m_numThreads = 4;
        }
        m_numWorkerThreads = m_numThreads - 1;
        m_numWorkersRunning = 0;
    }
    virtual ~btTaskSchedulerDefault()
    {
        shutdown();
    }
    void init()
    {
        int maxNumWorkerThreads = BT_MAX_THREAD_COUNT - 1;
        m_threadSupport = createThreadSupport( maxNumWorkerThreads, WorkerThreadFunc, WorkerThreadAllocFunc, "TaskScheduler" );
        m_jobContext.m_queueLock = m_threadSupport->createCriticalSection();
        for ( int i = 0; i < maxNumWorkerThreads; i++ )
        {
            WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
            btAssert( storage );
            storage->threadId = i;
            storage->status = WorkerThreadStatus::kSleeping;
        }
        setWorkersActive( false ); // no work for them yet
    }
    virtual void shutdown()
    {
        setWorkersActive( false );
        waitForWorkersToSleep();
        m_threadSupport->deleteCriticalSection( m_jobContext.m_queueLock );
        m_jobContext.m_queueLock = NULL;
        delete m_threadSupport;
        m_threadSupport = NULL;
    }
    void setWorkersActive( bool active )
    {
        m_jobContext.m_workersShouldCheckQueue = active;
    }
    virtual int getMaxNumThreads() const BT_OVERRIDE
    {
        return BT_MAX_THREAD_COUNT;
    }
    virtual int getNumThreads() const BT_OVERRIDE
    {
        return m_numThreads;
    }
    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
    {
        m_numThreads = btMax( btMin(numThreads, int(BT_MAX_THREAD_COUNT)), 1 );
        m_numWorkerThreads = m_numThreads - 1;
    }
    void waitJobs()
    {
        BT_PROFILE( "waitJobs" );
        // have the main thread work until the job queue is empty
        for ( ;; )
        {
            if ( IJob* job = m_jobContext.consumeJob() )
            {
                job->executeJob();
            }
            else
            {
                break;
            }
        }
        // done with jobs for now, tell workers to rest
        setWorkersActive( false );
        waitForWorkersToSleep();
    }
    void wakeWorkers()
    {
        BT_PROFILE( "wakeWorkers" );
        btAssert( m_jobContext.m_workersShouldCheckQueue );
        // tell each worker thread to start working
        for ( int i = 0; i < m_numWorkerThreads; i++ )
        {
            m_threadSupport->runTask( B3_THREAD_SCHEDULE_TASK, &m_jobContext, i );
            m_numWorkersRunning++;
        }
    }
    void waitForWorkersToSleep()
    {
        BT_PROFILE( "waitForWorkersToSleep" );
        while ( m_numWorkersRunning > 0 )
        {
            int iThread;
            int threadStatus;
            m_threadSupport->waitForResponse( &iThread, &threadStatus );  // wait for worker threads to finish working
            m_numWorkersRunning--;
        }
        //m_threadSupport->waitForAllTasksToComplete();
        for ( int i = 0; i < m_numWorkerThreads; i++ )
        {
            //m_threadSupport->waitForTaskCompleted( i );
            WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
            btAssert( storage );
            btAssert( storage->status == WorkerThreadStatus::kSleeping );
        }
    }
    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
    {
        BT_PROFILE( "parallelFor_ThreadSupport" );
        btAssert( iEnd >= iBegin );
        btAssert( grainSize >= 1 );
        int iterationCount = iEnd - iBegin;
        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
        {
            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;
            btAssert( jobCount >= 2 );  // need more than one job for multithreading
            if ( jobCount > m_jobs.size() )
            {
                m_jobs.resize( jobCount );
            }
            if ( jobCount > m_jobContext.m_jobQueue.capacity() )
            {
                m_jobContext.m_jobQueue.reserve( jobCount );
            }
            m_jobContext.clearQueue();
            // prepare worker threads for incoming work
            setWorkersActive( true );
            wakeWorkers();
            // submit all of the jobs
            int iJob = 0;
            for ( int i = iBegin; i < iEnd; i += grainSize )
            {
                btAssert( iJob < jobCount );
                int iE = btMin( i + grainSize, iEnd );
                ParallelForJob& job = m_jobs[ iJob ];
                job.init( i, iE, body );
                m_jobContext.submitJob( &job );
                iJob++;
            }
            // put the main thread to work on emptying the job queue and then wait for all workers to finish
            waitJobs();
            m_antiNestingLock.unlock();
        }
        else
        {
            BT_PROFILE( "parallelFor_mainThread" );
            // just run on main thread
            body.forLoop( iBegin, iEnd );
        }
    }
 };
 // Factory for the thread-support based scheduler: allocates a
 // btTaskSchedulerDefault on the heap, runs init() on it and returns it
 // through the btITaskScheduler interface.
 btITaskScheduler* createDefaultTaskScheduler()
 {
    btTaskSchedulerDefault* scheduler = new btTaskSchedulerDefault();
    scheduler->init();
    return scheduler;
 }
 #else // #if BT_THREADSAFE
 // Fallback used when BT_THREADSAFE is not enabled: no scheduler is
 // available, so callers receive NULL and must handle that case.
 btITaskScheduler* createDefaultTaskScheduler()
 {
    return NULL;
 }
 #endif // #else // #if BT_THREADSAFE
--- a/examples/MultiThreading/btTaskScheduler.h
+++ b/examples/MultiThreading/btTaskScheduler.h
@@ -1,26 +0,0 @@
 /*
 Copyright (c) 2003-2014 Erwin Coumans  http://bullet.googlecode.com
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose,
 including commercial applications, and to alter it and redistribute it freely,
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #ifndef BT_TASK_SCHEDULER_H
 #define BT_TASK_SCHEDULER_H
 class btITaskScheduler;
 btITaskScheduler* createDefaultTaskScheduler();
 #endif // BT_TASK_SCHEDULER_H
--- a/src/BulletDynamics/CMakeLists.txt
+++ b/src/BulletDynamics/CMakeLists.txt
@@ -15,6 +15,8 @@ SET(BulletDynamics_SRCS
 	ConstraintSolver/btHingeConstraint.cpp
 	ConstraintSolver/btPoint2PointConstraint.cpp
 	ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
 	ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
 	ConstraintSolver/btBatchedConstraints.cpp
 	ConstraintSolver/btNNCGConstraintSolver.cpp
 	ConstraintSolver/btSliderConstraint.cpp
 	ConstraintSolver/btSolve2LinearConstraint.cpp
@@ -62,6 +64,7 @@ SET(ConstraintSolver_HDRS
 	ConstraintSolver/btJacobianEntry.h
 	ConstraintSolver/btPoint2PointConstraint.h
 	ConstraintSolver/btSequentialImpulseConstraintSolver.h
 	ConstraintSolver/btSequentialImpulseConstraintSolverMt.h
 	ConstraintSolver/btNNCGConstraintSolver.h
 	ConstraintSolver/btSliderConstraint.h
 	ConstraintSolver/btSolve2LinearConstraint.h
--- a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
--- a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.h
+++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.h
@@ -0,0 +1,66 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #ifndef BT_BATCHED_CONSTRAINTS_H
 #define BT_BATCHED_CONSTRAINTS_H
 #include "LinearMath/btThreads.h"
 #include "LinearMath/btAlignedObjectArray.h"
 #include "BulletDynamics/ConstraintSolver/btSolverBody.h"
 #include "BulletDynamics/ConstraintSolver/btSolverConstraint.h"
 class btIDebugDraw;
 struct btBatchedConstraints
 {
    enum BatchingMethod
    {
        BATCHING_METHOD_SPATIAL_GRID_2D,
        BATCHING_METHOD_SPATIAL_GRID_3D,
        BATCHING_METHOD_COUNT
    };
    struct Range
    {
        int begin;
        int end;
        Range() : begin( 0 ), end( 0 ) {}
        Range( int _beg, int _end ) : begin( _beg ), end( _end ) {}
    };
    btAlignedObjectArray<int> m_constraintIndices;
    btAlignedObjectArray<Range> m_batches;  // each batch is a range of indices in the m_constraintIndices array
    btAlignedObjectArray<Range> m_phases;  // each phase is range of indices in the m_batches array
    btAlignedObjectArray<char> m_phaseGrainSize;  // max grain size for each phase
    btAlignedObjectArray<int> m_phaseOrder;  // phases can be done in any order, so we can randomize the order here
    btIDebugDraw* m_debugDrawer;
    static bool s_debugDrawBatches;
    btBatchedConstraints() {m_debugDrawer=NULL;}
    void setup( btConstraintArray* constraints,
        const btAlignedObjectArray<btSolverBody>& bodies,
        BatchingMethod batchingMethod,
        int minBatchSize,
        int maxBatchSize,
        btAlignedObjectArray<char>* scratchMemory
    );
    bool validate( btConstraintArray* constraints, const btAlignedObjectArray<btSolverBody>& bodies ) const;
 };
 #endif // BT_BATCHED_CONSTRAINTS_H
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp
@@ -1258,6 +1258,256 @@ void btSequentialImpulseConstraintSolver::convertContacts(btPersistentManifold**
 	}
 }
 /// Fills in the solver rows belonging to a single joint.
 /// @param currentConstraintRow  first of info1.m_numConstraintRows consecutive rows to write
 /// @param constraint            the joint being converted
 /// @param info1                 row count for this joint
 /// @param solverBodyIdA         index of body A in m_tmpSolverBodyPool
 /// @param solverBodyIdB         index of body B in m_tmpSolverBodyPool
 /// @param infoGlobal            global solver settings (time step, erp, cfm, damping, iterations)
 /// Also raises m_maxOverrideNumSolverIterations when this joint asks for more iterations.
 void btSequentialImpulseConstraintSolver::convertJoint(btSolverConstraint* currentConstraintRow,
    btTypedConstraint* constraint,
    const btTypedConstraint::btConstraintInfo1& info1,
    int solverBodyIdA,
    int solverBodyIdB,
    const btContactSolverInfo& infoGlobal
    )
 {
 	const btRigidBody& rbA = constraint->getRigidBodyA();
 	const btRigidBody& rbB = constraint->getRigidBodyB();
    const btSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA];
    const btSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB];
 	// a positive per-constraint override wins over the global iteration count
 	int overrideNumSolverIterations = constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;
 	if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)
 		m_maxOverrideNumSolverIterations = overrideNumSolverIterations;
 	// zero every row, then set the defaults: unbounded limits, no impulse, owning body ids
 	for (int j=0;j<info1.m_numConstraintRows;j++)
 	{
 		memset(&currentConstraintRow[j],0,sizeof(btSolverConstraint));
 		currentConstraintRow[j].m_lowerLimit = -SIMD_INFINITY;
 		currentConstraintRow[j].m_upperLimit = SIMD_INFINITY;
 		currentConstraintRow[j].m_appliedImpulse = 0.f;
 		currentConstraintRow[j].m_appliedPushImpulse = 0.f;
 		currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;
 		currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;
 		currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;
 	}
    // these vectors are already cleared in initSolverBody, no need to redundantly clear again
    btAssert(bodyAPtr->getDeltaLinearVelocity().isZero());
    btAssert(bodyAPtr->getDeltaAngularVelocity().isZero());
    btAssert(bodyAPtr->getPushVelocity().isZero());
    btAssert(bodyAPtr->getTurnVelocity().isZero());
    btAssert(bodyBPtr->getDeltaLinearVelocity().isZero());
    btAssert(bodyBPtr->getDeltaAngularVelocity().isZero());
    btAssert(bodyBPtr->getPushVelocity().isZero());
    btAssert(bodyBPtr->getTurnVelocity().isZero());
 	//bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
 	//bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
 	//bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
 	//bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
 	//bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
 	//bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
 	//bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
 	//bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
 	// info2 points into the first row; rowskip is the distance between rows in btScalar units
 	// (presumably how getInfo2 reaches the later rows -- confirm against the constraint implementations)
 	btTypedConstraint::btConstraintInfo2 info2;
 	info2.fps = 1.f/infoGlobal.m_timeStep;
 	info2.erp = infoGlobal.m_erp;
 	info2.m_J1linearAxis = currentConstraintRow->m_contactNormal1;
 	info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal;
 	info2.m_J2linearAxis = currentConstraintRow->m_contactNormal2;
 	info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal;
 	info2.rowskip = sizeof(btSolverConstraint)/sizeof(btScalar);//check this
 	///the size of btSolverConstraint needs be a multiple of btScalar
 	btAssert(info2.rowskip*sizeof(btScalar)== sizeof(btSolverConstraint));
 	info2.m_constraintError = &currentConstraintRow->m_rhs;
 	currentConstraintRow->m_cfm = infoGlobal.m_globalCfm;
 	info2.m_damping = infoGlobal.m_damping;
 	info2.cfm = &currentConstraintRow->m_cfm;
 	info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;
 	info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
 	info2.m_numIterations = infoGlobal.m_numIterations;
 	// let the joint write its jacobian, error, cfm and limits through the info2 pointers
 	constraint->getInfo2(&info2);
 	///finalize the constraint setup
 	for (int j=0;j<info1.m_numConstraintRows;j++)
 	{
 		btSolverConstraint& solverConstraint = currentConstraintRow[j];
 		// clamp both limits to the joint's breaking impulse threshold
 		if (solverConstraint.m_upperLimit>=constraint->getBreakingImpulseThreshold())
 		{
 			solverConstraint.m_upperLimit = constraint->getBreakingImpulseThreshold();
 		}
 		if (solverConstraint.m_lowerLimit<=-constraint->getBreakingImpulseThreshold())
 		{
 			solverConstraint.m_lowerLimit = -constraint->getBreakingImpulseThreshold();
 		}
 		// remember which joint produced this row
 		solverConstraint.m_originalContactPoint = constraint;
 		{
 			const btVector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal;
 			solverConstraint.m_angularComponentA = constraint->getRigidBodyA().getInvInertiaTensorWorld()*ftorqueAxis1*constraint->getRigidBodyA().getAngularFactor();
 		}
 		{
 			const btVector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal;
 			solverConstraint.m_angularComponentB = constraint->getRigidBodyB().getInvInertiaTensorWorld()*ftorqueAxis2*constraint->getRigidBodyB().getAngularFactor();
 		}
 		{
 			// sum the linear and angular jacobian terms weighted by inverse mass / inverse inertia of both bodies
 			btVector3 iMJlA = solverConstraint.m_contactNormal1*rbA.getInvMass();
 			btVector3 iMJaA = rbA.getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal;
 			btVector3 iMJlB = solverConstraint.m_contactNormal2*rbB.getInvMass();//sign of normal?
 			btVector3 iMJaB = rbB.getInvInertiaTensorWorld()*solverConstraint.m_relpos2CrossNormal;
 			btScalar sum = iMJlA.dot(solverConstraint.m_contactNormal1);
 			sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
 			sum += iMJlB.dot(solverConstraint.m_contactNormal2);
 			sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
 			btScalar fsum = btFabs(sum);
 			btAssert(fsum > SIMD_EPSILON);
 			btScalar sorRelaxation = 1.f;//todo: get from globalInfo?
 			// release builds fall back to 0 instead of dividing by a near-zero sum
 			solverConstraint.m_jacDiagABInv = fsum>SIMD_EPSILON?sorRelaxation/sum : 0.f;
 		}
 		{
 			btScalar rel_vel;
 			// external impulses only count for solver bodies backed by an original body
 			btVector3 externalForceImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalForceImpulse : btVector3(0,0,0);
 			btVector3 externalTorqueImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalTorqueImpulse : btVector3(0,0,0);
 			btVector3 externalForceImpulseB = bodyBPtr->m_originalBody ? bodyBPtr->m_externalForceImpulse : btVector3(0,0,0);
 			btVector3 externalTorqueImpulseB = bodyBPtr->m_originalBody ?bodyBPtr->m_externalTorqueImpulse : btVector3(0,0,0);
 			btScalar vel1Dotn = solverConstraint.m_contactNormal1.dot(rbA.getLinearVelocity()+externalForceImpulseA)
 								+ solverConstraint.m_relpos1CrossNormal.dot(rbA.getAngularVelocity()+externalTorqueImpulseA);
 			btScalar vel2Dotn = solverConstraint.m_contactNormal2.dot(rbB.getLinearVelocity()+externalForceImpulseB)
 												+ solverConstraint.m_relpos2CrossNormal.dot(rbB.getAngularVelocity()+externalTorqueImpulseB);
 			rel_vel = vel1Dotn+vel2Dotn;
 			btScalar restitution = 0.f;
 			btScalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2
 			btScalar	velocityError = restitution - rel_vel * info2.m_damping;
 			btScalar	penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv;
 			btScalar	velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv;
 			// m_rhs is overwritten: positional term plus damped velocity term, both scaled by m_jacDiagABInv
 			solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
 			solverConstraint.m_appliedImpulse = 0.f;
 		}
 	}
 }
 /// Converts every joint in `constraints` into rows of m_tmpSolverNonContactConstraintPool.
 /// Runs in three passes: refresh the joints, size the row pool, then fill the rows via convertJoint().
 void btSequentialImpulseConstraintSolver::convertJoints(btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal)
 {
 	BT_PROFILE("convertJoints");
 	// pass 1: rebuild each joint's jacobian and reset its applied impulse to zero
 	for (int iJoint = 0; iJoint < numConstraints; ++iJoint)
 	{
 		constraints[iJoint]->buildJacobian();
 		constraints[iJoint]->internalSetAppliedImpulse(0.0f);
 	}
 	// pass 2: clear joint feedback and count the total number of constraint rows
 	m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);
 	int rowCount = 0;
 	for (int iJoint = 0; iJoint < numConstraints; ++iJoint)
 	{
 		btTypedConstraint* joint = constraints[iJoint];
 		btTypedConstraint::btConstraintInfo1& sizes = m_tmpConstraintSizesPool[iJoint];
 		if (btJointFeedback* feedback = joint->getJointFeedback())
 		{
 			feedback->m_appliedForceBodyA.setZero();
 			feedback->m_appliedTorqueBodyA.setZero();
 			feedback->m_appliedForceBodyB.setZero();
 			feedback->m_appliedTorqueBodyB.setZero();
 		}
 		if (!joint->isEnabled())
 		{
 			// disabled joints contribute no rows
 			sizes.m_numConstraintRows = 0;
 			sizes.nub = 0;
 		}
 		else
 		{
 			joint->getInfo1(&sizes);
 		}
 		rowCount += sizes.m_numConstraintRows;
 	}
 	m_tmpSolverNonContactConstraintPool.resizeNoInitialize(rowCount);
 	// pass 3: set up the btSolverConstraint rows, each joint owning a contiguous run
 	int firstRow = 0;
 	for (int iJoint = 0; iJoint < numConstraints; ++iJoint)
 	{
 		const btTypedConstraint::btConstraintInfo1& sizes = m_tmpConstraintSizesPool[iJoint];
 		if (sizes.m_numConstraintRows == 0)
 		{
 			continue;
 		}
 		btAssert(firstRow < rowCount);
 		btSolverConstraint* jointRows = &m_tmpSolverNonContactConstraintPool[firstRow];
 		btTypedConstraint* joint = constraints[iJoint];
 		const int solverBodyIdA = getOrInitSolverBody(joint->getRigidBodyA(), infoGlobal.m_timeStep);
 		const int solverBodyIdB = getOrInitSolverBody(joint->getRigidBodyB(), infoGlobal.m_timeStep);
 		convertJoint(jointRows, joint, sizes, solverBodyIdA, solverBodyIdB, infoGlobal);
 		firstRow += sizes.m_numConstraintRows;
 	}
 }
 /// Rebuilds m_tmpSolverBodyPool from `bodies` and adds the enabled gyroscopic
 /// terms to the external torque impulse of every rigid body with non-zero inverse mass.
 void btSequentialImpulseConstraintSolver::convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal)
 {
 	BT_PROFILE("convertBodies");
 	// reset every object's companion id before solver bodies are handed out again
 	for (int iBody = 0; iBody < numBodies; ++iBody)
 	{
 		bodies[iBody]->setCompanionId(-1);
 	}
 #if BT_THREADSAFE
 	m_kinematicBodyUniqueIdToSolverBodyTable.resize( 0 );
 #endif // BT_THREADSAFE
 	// empty the pool but make room for one solver body per object plus one extra
 	m_tmpSolverBodyPool.reserve(numBodies+1);
 	m_tmpSolverBodyPool.resize(0);
 	for (int iBody = 0; iBody < numBodies; ++iBody)
 	{
 		btCollisionObject* object = bodies[iBody];
 		const int solverBodyId = getOrInitSolverBody(*object, infoGlobal.m_timeStep);
 		btRigidBody* rigid = btRigidBody::upcast(object);
 		if (!rigid || !rigid->getInvMass())
 		{
 			// not a rigid body, or zero inverse mass: no gyroscopic term
 			continue;
 		}
 		btSolverBody& solverBody = m_tmpSolverBodyPool[solverBodyId];
 		if (rigid->getFlags() & BT_ENABLE_GYROSCOPIC_FORCE_EXPLICIT)
 		{
 			const btVector3 explicitForce = rigid->computeGyroscopicForceExplicit(infoGlobal.m_maxGyroscopicForce);
 			solverBody.m_externalTorqueImpulse -= explicitForce*rigid->getInvInertiaTensorWorld()*infoGlobal.m_timeStep;
 		}
 		if (rigid->getFlags() & BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_WORLD)
 		{
 			const btVector3 worldImpulse = rigid->computeGyroscopicImpulseImplicit_World(infoGlobal.m_timeStep);
 			solverBody.m_externalTorqueImpulse += worldImpulse;
 		}
 		if (rigid->getFlags() & BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_BODY)
 		{
 			const btVector3 bodyImpulse = rigid->computeGyroscopicImpulseImplicit_Body(infoGlobal.m_timeStep);
 			solverBody.m_externalTorqueImpulse += bodyImpulse;
 		}
 	}
 }
 btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
 {
 	m_fixedBodyId = -1;
@@ -1344,250 +1594,13 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 #endif //BT_ADDITIONAL_DEBUG
 	for (int i = 0; i < numBodies; i++)
 	{
 		bodies[i]->setCompanionId(-1);
 	}
 #if BT_THREADSAFE
    m_kinematicBodyUniqueIdToSolverBodyTable.resize( 0 );
 #endif // BT_THREADSAFE
 	m_tmpSolverBodyPool.reserve(numBodies+1);
 	m_tmpSolverBodyPool.resize(0);
 	//btSolverBody& fixedBody = m_tmpSolverBodyPool.expand();
    //initSolverBody(&fixedBody,0);
 	//convert all bodies
    convertBodies(bodies, numBodies, infoGlobal);
    convertJoints(constraints, numConstraints, infoGlobal);
-	for (int i=0;i<numBodies;i++)
+	convertContacts(manifoldPtr,numManifolds,infoGlobal);
 	{
 		int bodyId = getOrInitSolverBody(*bodies[i],infoGlobal.m_timeStep);
 		btRigidBody* body = btRigidBody::upcast(bodies[i]);
 		if (body && body->getInvMass())
 		{
 			btSolverBody& solverBody = m_tmpSolverBodyPool[bodyId];
 			btVector3 gyroForce (0,0,0);
 			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_EXPLICIT)
 			{
 				gyroForce = body->computeGyroscopicForceExplicit(infoGlobal.m_maxGyroscopicForce);
 				solverBody.m_externalTorqueImpulse -= gyroForce*body->getInvInertiaTensorWorld()*infoGlobal.m_timeStep;
 			}
 			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_WORLD)
 			{
 				gyroForce = body->computeGyroscopicImpulseImplicit_World(infoGlobal.m_timeStep);
 				solverBody.m_externalTorqueImpulse += gyroForce;
 			}
 			if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_BODY)
 			{
 				gyroForce = body->computeGyroscopicImpulseImplicit_Body(infoGlobal.m_timeStep);
 				solverBody.m_externalTorqueImpulse += gyroForce;
 			}
 		}
 	}
 	if (1)
 	{
 		int j;
 		for (j=0;j<numConstraints;j++)
 		{
 			btTypedConstraint* constraint = constraints[j];
 			constraint->buildJacobian();
 			constraint->internalSetAppliedImpulse(0.0f);
 		}
 	}
 	//btRigidBody* rb0=0,*rb1=0;
 	//if (1)
 	{
 		{
 			int totalNumRows = 0;
 			int i;
 			m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);
 			//calculate the total number of contraint rows
 			for (i=0;i<numConstraints;i++)
 			{
 				btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
 				btJointFeedback* fb = constraints[i]->getJointFeedback();
 				if (fb)
 				{
 					fb->m_appliedForceBodyA.setZero();
 					fb->m_appliedTorqueBodyA.setZero();
 					fb->m_appliedForceBodyB.setZero();
 					fb->m_appliedTorqueBodyB.setZero();
 				}
 				if (constraints[i]->isEnabled())
 				{
 					constraints[i]->getInfo1(&info1);
 				} else
 				{
 					info1.m_numConstraintRows = 0;
 					info1.nub = 0;
 				}
 				totalNumRows += info1.m_numConstraintRows;
 			}
 			m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows);
 			///setup the btSolverConstraints
 			int currentRow = 0;
 			for (i=0;i<numConstraints;i++)
 			{
 				const btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
 				if (info1.m_numConstraintRows)
 				{
 					btAssert(currentRow<totalNumRows);
 					btSolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[currentRow];
 					btTypedConstraint* constraint = constraints[i];
 					btRigidBody& rbA = constraint->getRigidBodyA();
 					btRigidBody& rbB = constraint->getRigidBodyB();
 					int solverBodyIdA = getOrInitSolverBody(rbA,infoGlobal.m_timeStep);
                    int solverBodyIdB = getOrInitSolverBody(rbB,infoGlobal.m_timeStep);
                    btSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA];
                    btSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB];
 					int overrideNumSolverIterations = constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;
 					if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)
 						m_maxOverrideNumSolverIterations = overrideNumSolverIterations;
 					int j;
 					for ( j=0;j<info1.m_numConstraintRows;j++)
 					{
 						memset(&currentConstraintRow[j],0,sizeof(btSolverConstraint));
 						currentConstraintRow[j].m_lowerLimit = -SIMD_INFINITY;
 						currentConstraintRow[j].m_upperLimit = SIMD_INFINITY;
 						currentConstraintRow[j].m_appliedImpulse = 0.f;
 						currentConstraintRow[j].m_appliedPushImpulse = 0.f;
 						currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;
 						currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;
 						currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;
 					}
 					bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
 					bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
 					bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
 					bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
 					bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
 					bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
 					bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f);
 					bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f);
 					btTypedConstraint::btConstraintInfo2 info2;
 					info2.fps = 1.f/infoGlobal.m_timeStep;
 					info2.erp = infoGlobal.m_erp;
 					info2.m_J1linearAxis = currentConstraintRow->m_contactNormal1;
 					info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal;
 					info2.m_J2linearAxis = currentConstraintRow->m_contactNormal2;
 					info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal;
 					info2.rowskip = sizeof(btSolverConstraint)/sizeof(btScalar);//check this
 					///the size of btSolverConstraint needs be a multiple of btScalar
 		            btAssert(info2.rowskip*sizeof(btScalar)== sizeof(btSolverConstraint));
 					info2.m_constraintError = &currentConstraintRow->m_rhs;
 					currentConstraintRow->m_cfm = infoGlobal.m_globalCfm;
 					info2.m_damping = infoGlobal.m_damping;
 					info2.cfm = &currentConstraintRow->m_cfm;
 					info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;
 					info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
 					info2.m_numIterations = infoGlobal.m_numIterations;
 					constraints[i]->getInfo2(&info2);
 					///finalize the constraint setup
 					for ( j=0;j<info1.m_numConstraintRows;j++)
 					{
 						btSolverConstraint& solverConstraint = currentConstraintRow[j];
 						if (solverConstraint.m_upperLimit>=constraints[i]->getBreakingImpulseThreshold())
 						{
 							solverConstraint.m_upperLimit = constraints[i]->getBreakingImpulseThreshold();
 						}
 						if (solverConstraint.m_lowerLimit<=-constraints[i]->getBreakingImpulseThreshold())
 						{
 							solverConstraint.m_lowerLimit = -constraints[i]->getBreakingImpulseThreshold();
 						}
 						solverConstraint.m_originalContactPoint = constraint;
 						{
 							const btVector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal;
 							solverConstraint.m_angularComponentA = constraint->getRigidBodyA().getInvInertiaTensorWorld()*ftorqueAxis1*constraint->getRigidBodyA().getAngularFactor();
 						}
 						{
 							const btVector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal;
 							solverConstraint.m_angularComponentB = constraint->getRigidBodyB().getInvInertiaTensorWorld()*ftorqueAxis2*constraint->getRigidBodyB().getAngularFactor();
 						}
 						{
 							btVector3 iMJlA = solverConstraint.m_contactNormal1*rbA.getInvMass();
 							btVector3 iMJaA = rbA.getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal;
 							btVector3 iMJlB = solverConstraint.m_contactNormal2*rbB.getInvMass();//sign of normal?
 							btVector3 iMJaB = rbB.getInvInertiaTensorWorld()*solverConstraint.m_relpos2CrossNormal;
 							btScalar sum = iMJlA.dot(solverConstraint.m_contactNormal1);
 							sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
 							sum += iMJlB.dot(solverConstraint.m_contactNormal2);
 							sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
 							btScalar fsum = btFabs(sum);
 							btAssert(fsum > SIMD_EPSILON);
 							btScalar sorRelaxation = 1.f;//todo: get from globalInfo?
 							solverConstraint.m_jacDiagABInv = fsum>SIMD_EPSILON?sorRelaxation/sum : 0.f;
 						}
 						{
 							btScalar rel_vel;
 							btVector3 externalForceImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalForceImpulse : btVector3(0,0,0);
 							btVector3 externalTorqueImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalTorqueImpulse : btVector3(0,0,0);
 							btVector3 externalForceImpulseB = bodyBPtr->m_originalBody ? bodyBPtr->m_externalForceImpulse : btVector3(0,0,0);
 							btVector3 externalTorqueImpulseB = bodyBPtr->m_originalBody ?bodyBPtr->m_externalTorqueImpulse : btVector3(0,0,0);
 							btScalar vel1Dotn = solverConstraint.m_contactNormal1.dot(rbA.getLinearVelocity()+externalForceImpulseA)
 												+ solverConstraint.m_relpos1CrossNormal.dot(rbA.getAngularVelocity()+externalTorqueImpulseA);
 							btScalar vel2Dotn = solverConstraint.m_contactNormal2.dot(rbB.getLinearVelocity()+externalForceImpulseB)
 																+ solverConstraint.m_relpos2CrossNormal.dot(rbB.getAngularVelocity()+externalTorqueImpulseB);
 							rel_vel = vel1Dotn+vel2Dotn;
 							btScalar restitution = 0.f;
 							btScalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2
 							btScalar	velocityError = restitution - rel_vel * info2.m_damping;
 							btScalar	penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv;
 							btScalar	velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv;
 							solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
 							solverConstraint.m_appliedImpulse = 0.f;
 						}
 					}
 				}
 				currentRow+=m_tmpConstraintSizesPool[i].m_numConstraintRows;
 			}
 		}
 		convertContacts(manifoldPtr,numManifolds,infoGlobal);
 	}
 //	btContactSolverInfo info = infoGlobal;
@@ -1627,6 +1640,7 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol
 btScalar btSequentialImpulseConstraintSolver::solveSingleIteration(int iteration, btCollisionObject** /*bodies */,int /*numBodies*/,btPersistentManifold** /*manifoldPtr*/, int /*numManifolds*/,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* /*debugDrawer*/)
 {
    BT_PROFILE("solveSingleIteration");
 	btScalar leastSquaresResidual = 0.f;
 	int numNonContactPool = m_tmpSolverNonContactConstraintPool.size();
@@ -1805,6 +1819,7 @@ btScalar btSequentialImpulseConstraintSolver::solveSingleIteration(int iteration
 void btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
 {
 	BT_PROFILE("solveGroupCacheFriendlySplitImpulseIterations");
 	int iteration;
 	if (infoGlobal.m_splitImpulse)
 	{
@@ -1863,14 +1878,9 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyIterations(
 	return 0.f;
 }
-btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal)
+void btSequentialImpulseConstraintSolver::writeBackContacts(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal)
 {
-	int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+		for (int j=iBegin; j<iEnd; j++)
 	int i,j;
 	if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
 	{
 		for (j=0;j<numPoolConstraints;j++)
 		{
 			const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[j];
 			btManifoldPoint* pt = (btManifoldPoint*) solveManifold.m_originalContactPoint;
@@ -1886,10 +1896,11 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCo
 			}
 			//do a callback here?
 		}
-	}
+}
-	numPoolConstraints = m_tmpSolverNonContactConstraintPool.size();
+void btSequentialImpulseConstraintSolver::writeBackJoints(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal)
-	for (j=0;j<numPoolConstraints;j++)
+{
 	for (int j=iBegin; j<iEnd; j++)
 	{
 		const btSolverConstraint& solverConstr = m_tmpSolverNonContactConstraintPool[j];
 		btTypedConstraint* constr = (btTypedConstraint*)solverConstr.m_originalContactPoint;
@@ -1909,10 +1920,12 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCo
 			constr->setEnabled(false);
 		}
 	}
 }
-
+void btSequentialImpulseConstraintSolver::writeBackBodies(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal)
-	for ( i=0;i<m_tmpSolverBodyPool.size();i++)
+{
 	for (int i=iBegin; i<iEnd; i++)
 	{
 		btRigidBody* body = m_tmpSolverBodyPool[i].m_originalBody;
 		if (body)
@@ -1936,6 +1949,19 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCo
 			m_tmpSolverBodyPool[i].m_originalBody->setCompanionId(-1);
 		}
 	}
 }
 btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal)
 {
 	BT_PROFILE("solveGroupCacheFriendlyFinish");
 	if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING)
 	{
        writeBackContacts(0, m_tmpSolverContactConstraintPool.size(), infoGlobal);
 	}
    writeBackJoints(0, m_tmpSolverNonContactConstraintPool.size(), infoGlobal);
    writeBackBodies(0, m_tmpSolverBodyPool.size(), infoGlobal);
 	m_tmpSolverContactConstraintPool.resizeNoInitialize(0);
 	m_tmpSolverNonContactConstraintPool.resizeNoInitialize(0);
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h
@@ -95,6 +95,10 @@ protected:
 	void	convertContact(btPersistentManifold* manifold,const btContactSolverInfo& infoGlobal);
    virtual void convertJoints(btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal);
    void convertJoint(btSolverConstraint* destConstraintRow, btTypedConstraint* srcConstraint, const btTypedConstraint::btConstraintInfo1& info1, int solverBodyIdA, int solverBodyIdB, const btContactSolverInfo& infoGlobal);
    virtual void convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal);
 	btSimdScalar	resolveSplitPenetrationSIMD(btSolverBody& bodyA,btSolverBody& bodyB, const btSolverConstraint& contactConstraint)
    {
@@ -121,7 +125,9 @@ protected:
 protected:
-	
+    void writeBackContacts(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
    void writeBackJoints(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
    void writeBackBodies(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
 	virtual void solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
 	virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal);
 	virtual btScalar solveSingleIteration(int iteration, btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp
--- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h
+++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h
@@ -0,0 +1,154 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose, 
 including commercial applications, and to alter it and redistribute it freely, 
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #ifndef BT_SEQUENTIAL_IMPULSE_CONSTRAINT_SOLVER_MT_H
 #define BT_SEQUENTIAL_IMPULSE_CONSTRAINT_SOLVER_MT_H
 #include "btSequentialImpulseConstraintSolver.h"
 #include "btBatchedConstraints.h"
 #include "LinearMath/btThreads.h"
 ///
 /// btSequentialImpulseConstraintSolverMt
 ///
 ///  A multithreaded variant of the sequential impulse constraint solver. The constraints to be solved are grouped into
 ///  batches and phases where each batch of constraints within a given phase can be solved in parallel with the rest.
 ///  Ideally we want as few phases as possible, and each phase should have many batches, and all of the batches should
 ///  have about the same number of constraints.
 ///  This method works best on a large island of many constraints.
 ///
 ///  Supports all of the features of the normal sequential impulse solver such as:
 ///    - split penetration impulse
 ///    - rolling friction
 ///    - interleaving constraints
 ///    - warmstarting
 ///    - 2 friction directions
 ///    - randomized constraint ordering
 ///    - early termination when leastSquaresResidualThreshold is satisfied
 ///
 ///  When the SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS flag is enabled, unlike the normal SequentialImpulse solver,
 ///  the rolling friction is interleaved as well.
 ///  Interleaving the contact penetration constraints with friction reduces the number of parallel loops that need to be done,
 ///  which reduces threading overhead so it can be a performance win, however, it does seem to produce a less stable simulation,
 ///  at least on stacks of blocks.
 ///
 ///  When the SOLVER_RANDMIZE_ORDER flag is enabled, the ordering of phases, and the ordering of constraints within each batch
 ///  is randomized, however it does not swap constraints between batches.
 ///  This is to avoid regenerating the batches for each solver iteration which would be quite costly in performance.
 ///
 ///  Note that a non-zero leastSquaresResidualThreshold could possibly affect the determinism of the simulation
 ///  if the task scheduler's parallelSum operation is non-deterministic. The parallelSum operation can be non-deterministic
 ///  because floating point addition is not associative due to rounding errors.
 ///  The task scheduler can and should ensure that the result of any parallelSum operation is deterministic.
 ///
 ATTRIBUTE_ALIGNED16(class) btSequentialImpulseConstraintSolverMt : public btSequentialImpulseConstraintSolver
 {
 public:
    // solver stage entry points, overriding the virtual methods of btSequentialImpulseConstraintSolver
 	virtual void solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE;
 	virtual btScalar solveSingleIteration(int iteration, btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE;
 	virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE;
 	virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
    // temp struct used to collect info from persistent manifolds into a cache-friendly struct using multiple threads
    struct btContactManifoldCachedInfo
    {
        // capacity of the two per-contact arrays below
        static const int MAX_NUM_CONTACT_POINTS = 4;
        int numTouchingContacts;
        int solverBodyIds[ 2 ];
        int contactIndex;
        int rollingFrictionIndex;
        bool contactHasRollingFriction[ MAX_NUM_CONTACT_POINTS ];
        btManifoldPoint* contactPoints[ MAX_NUM_CONTACT_POINTS ];
    };
    // temp struct used for setting up joint constraints in parallel
    struct JointParams
    {
        // NOTE(review): presumably indices into the solver constraint / solver body pools -- confirm against the .cpp
        int m_solverConstraint;
        int m_solverBodyA;
        int m_solverBodyB;
    };
    // range-based helpers operating on constraints[iBegin..iEnd)
    void internalInitMultipleJoints(btTypedConstraint** constraints, int iBegin, int iEnd);
    void internalConvertMultipleJoints( const btAlignedObjectArray<JointParams>& jointParamsArray, btTypedConstraint** constraints, int iBegin, int iEnd, const btContactSolverInfo& infoGlobal );
    // parameters to control batching
    static bool s_allowNestedParallelForLoops;        // whether to allow nested parallel operations
    static int s_minimumContactManifoldsForBatching;  // don't even try to batch if fewer manifolds than this
    static btBatchedConstraints::BatchingMethod s_contactBatchingMethod;
    static btBatchedConstraints::BatchingMethod s_jointBatchingMethod;
    static int s_minBatchSize;  // desired number of constraints per batch
    static int s_maxBatchSize;
 protected:
    // number of padding bytes placed between the two spin mutexes below
    static const int CACHE_LINE_SIZE = 64;
    btBatchedConstraints m_batchedContactConstraints;
    btBatchedConstraints m_batchedJointConstraints;
    int m_numFrictionDirections;
    bool m_useBatching;
    bool m_useObsoleteJointConstraints;
    btAlignedObjectArray<btContactManifoldCachedInfo> m_manifoldCachedInfoArray;
    btAlignedObjectArray<int> m_rollingFrictionIndexTable;  // lookup table mapping contact index to rolling friction index
    btSpinMutex m_bodySolverArrayMutex;
    char m_antiFalseSharingPadding[CACHE_LINE_SIZE]; // padding to keep mutexes in separate cachelines
    btSpinMutex m_kinematicBodyUniqueIdToSolverBodyTableMutex;
    btAlignedObjectArray<char> m_scratchMemory;
    // per-iteration solve steps; each resolve* method returns a btScalar
    // NOTE(review): presumably the least squares residual of that step -- confirm against the .cpp
    virtual void randomizeConstraintOrdering( int iteration, int numIterations );
    virtual btScalar resolveAllJointConstraints( int iteration );
    virtual btScalar resolveAllContactConstraints();
    virtual btScalar resolveAllContactFrictionConstraints();
    virtual btScalar resolveAllContactConstraintsInterleaved();
    virtual btScalar resolveAllRollingFrictionConstraints();
    virtual void setupBatchedContactConstraints();
    virtual void setupBatchedJointConstraints();
    // conversion hooks overriding the base solver's versions
    virtual void convertJoints(btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
 	virtual void convertContacts(btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
    virtual void convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
 	int getOrInitSolverBodyThreadsafe(btCollisionObject& body, btScalar timeStep);
    void allocAllContactConstraints(btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal);
    void setupAllContactConstraints(const btContactSolverInfo& infoGlobal);
    void randomizeBatchedConstraintOrdering( btBatchedConstraints* batchedConstraints );
 public:
 	BT_DECLARE_ALIGNED_ALLOCATOR();
 	btSequentialImpulseConstraintSolverMt();
 	virtual ~btSequentialImpulseConstraintSolverMt();
    // NOTE(review): the range-based methods below are public, presumably so parallel-loop
    // helper objects defined outside the class can call them -- confirm against the .cpp
    btScalar resolveMultipleJointConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd, int iteration );
    btScalar resolveMultipleContactConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
    btScalar resolveMultipleContactSplitPenetrationImpulseConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
    btScalar resolveMultipleContactFrictionConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
    btScalar resolveMultipleContactRollingFrictionConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
    btScalar resolveMultipleContactConstraintsInterleaved( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd );
    void internalCollectContactManifoldCachedInfo(btContactManifoldCachedInfo* cachedInfoArray, btPersistentManifold** manifold, int numManifolds, const btContactSolverInfo& infoGlobal);
    void internalAllocContactConstraints(const btContactManifoldCachedInfo* cachedInfoArray, int numManifolds);
    void internalSetupContactConstraints(int iContact, const btContactSolverInfo& infoGlobal);
    void internalConvertBodies(btCollisionObject** bodies, int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
    void internalWriteBackContacts(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
    void internalWriteBackJoints(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
    void internalWriteBackBodies(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal);
 };
 #endif //BT_SEQUENTIAL_IMPULSE_CONSTRAINT_SOLVER_MT_H
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp
@@ -325,3 +325,14 @@ void btDiscreteDynamicsWorldMt::integrateTransforms( btScalar timeStep )
    }
 }
 int	btDiscreteDynamicsWorldMt::stepSimulation( btScalar timeStep, int maxSubSteps, btScalar fixedTimeStep )
 {
    // Run the regular step through the base class, then give the task
    // scheduler (if one is installed) a hint that its workers may go to sleep.
    const int stepsTaken = btDiscreteDynamicsWorld::stepSimulation( timeStep, maxSubSteps, fixedTimeStep );
    btITaskScheduler* taskScheduler = btGetTaskScheduler();
    if ( taskScheduler )
    {
        // let Bullet's worker threads sleep so other threads can run
        taskScheduler->sleepWorkerThreadsHint();
    }
    return stepsTaken;
 }
--- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
+++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h
@@ -129,6 +129,8 @@ public:
        btCollisionConfiguration* collisionConfiguration
    );
 	virtual ~btDiscreteDynamicsWorldMt();
    virtual int	stepSimulation( btScalar timeStep, int maxSubSteps, btScalar fixedTimeStep ) BT_OVERRIDE;
 };
 #endif //BT_DISCRETE_DYNAMICS_WORLD_H
--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp
@@ -22,6 +22,7 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 #include "BulletCollision/CollisionDispatch/btCollisionWorld.h"
 #include "BulletDynamics/ConstraintSolver/btTypedConstraint.h"
 #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h"  // for s_minimumContactManifoldsForBatching
 //#include <stdio.h>
 #include "LinearMath/btQuickprof.h"
@@ -589,14 +590,52 @@ struct UpdateIslandDispatcher : public btIParallelForBody
    }
 };
 // Hands the islands to the callback through UpdateIslandDispatcher: the leading islands
 // whose manifold count reaches s_minimumContactManifoldsForBatching are dispatched one
 // after another on the calling thread, the remaining islands go through btParallelFor.
 void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray<Island*>* islandsPtr, IslandCallback* callback )
 {
    BT_PROFILE( "parallelIslandDispatch" );
-    int grainSize = 1;  // iterations per task
+    //
    // if there are islands with many contacts, it may be faster to submit these
    // large islands *serially* to a single parallel constraint solver, and then later
    // submit the remaining smaller islands in parallel to multiple sequential solvers.
    //
    // Some task schedulers do not deal well with nested parallelFor loops. One implementation
    // of OpenMP was actually slower than doing everything single-threaded. Intel TBB
    // on the other hand, seems to do a pretty respectable job with it.
    //
    // When solving islands in parallel, the worst case performance happens when there
    // is one very large island and then perhaps a smattering of very small
    // islands -- one worker thread takes the large island and the remaining workers
    // tear through the smaller islands and then sit idle waiting for the first worker
    // to finish. Solving islands in parallel works best when there are numerous small
    // islands, roughly equal in size.
    //
    // By contrast, the other approach -- the parallel constraint solver -- is only
    // able to deliver a worthwhile speedup when the island is large. For smaller islands,
    // it is difficult to extract a useful amount of parallelism -- the overhead of grouping
    // the constraints into batches and sending the batches to worker threads can nullify
    // any gains from parallelism.
    //
    UpdateIslandDispatcher dispatcher;
    dispatcher.islandsPtr = islandsPtr;
    dispatcher.callback = callback;
-    btParallelFor( 0, islandsPtr->size(), grainSize, dispatcher );
+    // We take advantage of the fact the islands are sorted in order of decreasing size
    // advance iBegin to the first island that has too few manifolds for batching
    int iBegin = 0;
    while (iBegin < islandsPtr->size())
    {
        btSimulationIslandManagerMt::Island* island = (*islandsPtr)[ iBegin ];
        if (island->manifoldArray.size() < btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching)
        {
            // OK to submit the rest of the array in parallel
            break;
        }
        ++iBegin;
    }
    // serial dispatch for large islands (if any)
    dispatcher.forLoop(0, iBegin);
    // parallel dispatch for rest
    // (grain size 1: one island per parallel-for iteration chunk)
    btParallelFor( iBegin, islandsPtr->size(), 1, dispatcher );
 }
--- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
+++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h
@@ -106,5 +106,7 @@ public:
    }
 };
 extern int gLargeIslandManifoldCount;
 #endif //BT_SIMULATION_ISLAND_MANAGER_H
--- a/src/LinearMath/CMakeLists.txt
+++ b/src/LinearMath/CMakeLists.txt
@@ -14,6 +14,9 @@ SET(LinearMath_SRCS
 	btSerializer64.cpp
 	btThreads.cpp
 	btVector3.cpp
 	TaskScheduler/btTaskScheduler.cpp
 	TaskScheduler/btThreadSupportPosix.cpp
 	TaskScheduler/btThreadSupportWin32.cpp
 )
 SET(LinearMath_HDRS
@@ -44,6 +47,7 @@ SET(LinearMath_HDRS
 	btTransform.h
 	btTransformUtil.h
 	btVector3.h
 	TaskScheduler/btThreadSupportInterface.h
 )
 ADD_LIBRARY(LinearMath ${LinearMath_SRCS} ${LinearMath_HDRS})
--- a/src/LinearMath/TaskScheduler/btTaskScheduler.cpp
+++ b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp
@@ -0,0 +1,619 @@
 #include "LinearMath/btMinMax.h"
 #include "LinearMath/btAlignedObjectArray.h"
 #include "LinearMath/btThreads.h"
 #include "LinearMath/btQuickprof.h"
 #include <stdio.h>
 #include <algorithm>
 typedef void( *btThreadFunc )( void* userPtr, void* lsMemory );
 typedef void* ( *btThreadLocalStorageFunc )();
 #if BT_THREADSAFE
 #include "btThreadSupportInterface.h"
 ///
 /// getNumHardwareThreads()
 ///
 ///
 /// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
 ///
 #if __cplusplus >= 201103L
 #include <thread>
 // C++11 and later: ask the standard library
 int getNumHardwareThreads()
 {
    const unsigned int hwThreads = std::thread::hardware_concurrency();
    return static_cast<int>( hwThreads );
 }
 #elif defined( _WIN32 )
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 // pre-C++11 Windows builds: query the OS
 int getNumHardwareThreads()
 {
    // caps out at 32
    SYSTEM_INFO sysInfo;
    GetSystemInfo( &sysInfo );
    return static_cast<int>( sysInfo.dwNumberOfProcessors );
 }
 #else
 // no portable way to find out here: 0 means "don't know"
 int getNumHardwareThreads()
 {
    const int unknownCount = 0;
    return unknownCount;
 }
 #endif
 void btSpinPause()
 {
    // Called from the spin-wait loops in this file; it only does something on
    // Windows builds (YieldProcessor) and is an empty function everywhere else.
 #ifdef _WIN32
    YieldProcessor();
 #endif
 }
 // Scoping struct for the state a worker thread records in its WorkerThreadLocalStorage.
 struct WorkerThreadStatus
 {
    enum Type
    {
        kInvalid,         // NOTE(review): not assigned anywhere in the visible code -- presumably a "never set" marker
        kWaitingForWork,  // set by the worker after it has drained the job queue
        kWorking,         // set by the worker right before it executes a job
        kSleeping,        // set at init and when the worker thread function is about to return
    };
 };
 // Abstract unit of work placed in the JobContext queue.
 // NOTE(review): there is no virtual destructor; the visible code never deletes a job
 // through an IJob pointer -- confirm before adding code that does.
 struct IJob
 {
    // Runs the job; threadId is 0 for the main thread and the worker's threadId otherwise.
    virtual void executeJob(int threadId) = 0;
 };
 class ParallelForJob : public IJob
 {
    const btIParallelForBody* mBody;
    int mBegin;
    int mEnd;
 public:
    ParallelForJob( int iBegin, int iEnd, const btIParallelForBody& body )
    {
        mBody = &body;
        mBegin = iBegin;
        mEnd = iEnd;
    }
    virtual void executeJob(int threadId) BT_OVERRIDE
    {
        BT_PROFILE( "executeJob" );
        // call the functor body to do the work
        mBody->forLoop( mBegin, mEnd );
    }
 };
 // Size in bytes that each ThreadLocalSum is padded out to.
 static const int kCacheLineSize = 64;
 // Per-thread accumulator used by ParallelSumJob (indexed by thread id).
 struct ThreadLocalSum
 {
    btScalar mSum;
    // pads the struct so that sizeof(mSum) + padding == kCacheLineSize
    char mCachePadding[ kCacheLineSize - sizeof( btScalar ) ];
 };
 class ParallelSumJob : public IJob
 {
    const btIParallelSumBody* mBody;
    ThreadLocalSum* mSumArray;
    int mBegin;
    int mEnd;
 public:
    ParallelSumJob( int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalSum* sums )
    {
        mBody = &body;
        mSumArray = sums;
        mBegin = iBegin;
        mEnd = iEnd;
    }
    virtual void executeJob( int threadId ) BT_OVERRIDE
    {
        BT_PROFILE( "executeJob" );
        // call the functor body to do the work
        btScalar val = mBody->sumLoop( mBegin, mEnd );
        // by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision)
        const float TRUNC_SCALE = float(1<<19);
        val = floor(val*TRUNC_SCALE+0.5f)/TRUNC_SCALE;  // truncate some bits
        mSumArray[threadId].mSum += val;
    }
 };
 // State shared between the scheduler (main thread) and the worker threads:
 // the job queue, the lock protecting its indices, and the flags the workers poll.
 struct JobContext
 {
    JobContext()
    {
        m_queueLock = NULL;
        m_headIndex = 0;
        m_tailIndex = 0;
        // the queue starts out empty; without this the flag is indeterminate until the
        // first clearQueue()/submitJob(), yet consumeJob() and the workers read it
        m_queueIsEmpty = true;
        m_workersShouldCheckQueue = false;
        m_workersShouldSleep = false;
        m_useSpinMutex = false;
        m_coolDownTime = 1000; // 1000 microseconds
    }
    btCriticalSection* m_queueLock;            // used by lockQueue() when m_useSpinMutex is false
    btSpinMutex m_mutex;                       // used by lockQueue() when m_useSpinMutex is true
    volatile bool m_workersShouldCheckQueue;   // polled by workers: jobs are (or may be) incoming
    volatile bool m_workersShouldSleep;        // polled by workers: leave the worker loop
    btAlignedObjectArray<IJob*> m_jobQueue;
    bool m_queueIsEmpty;                       // true when m_headIndex has caught up with m_tailIndex
    int m_tailIndex;                           // number of jobs submitted since the last clearQueue()
    int m_headIndex;                           // index of the next job to hand out
    bool m_useSpinMutex;
    unsigned int m_coolDownTime;               // microseconds an idle worker waits before it goes to sleep
    btClock m_clock;
    // Acquires whichever lock m_useSpinMutex selects.
    void lockQueue()
    {
        if ( m_useSpinMutex )
        {
            m_mutex.lock();
        }
        else
        {
            m_queueLock->lock();
        }
    }
    // Releases the lock taken by lockQueue().
    void unlockQueue()
    {
        if ( m_useSpinMutex )
        {
            m_mutex.unlock();
        }
        else
        {
            m_queueLock->unlock();
        }
    }
    // Resets the queue to empty. The indices and the empty flag are reset under the lock;
    // the array itself is resized afterwards, outside the lock.
    void clearQueue()
    {
        lockQueue();
        m_headIndex = 0;
        m_tailIndex = 0;
        m_queueIsEmpty = true;
        unlockQueue();
        m_jobQueue.resizeNoInitialize( 0 );
    }
    // Appends a job. The pointer is stored first (outside the lock) and only then
    // published by bumping m_tailIndex under the lock.
    void submitJob( IJob* job )
    {
        m_jobQueue.push_back( job );
        lockQueue();
        m_tailIndex++;
        m_queueIsEmpty = false;
        unlockQueue();
    }
    // Takes the next job from the queue, or returns NULL when the queue is empty.
    IJob* consumeJob()
    {
        if ( m_queueIsEmpty )
        {
            // lock free path. even if this is taken erroneously it isn't harmful
            return NULL;
        }
        IJob* job = NULL;
        lockQueue();
        // re-check under the lock: another thread may have taken the last job
        if ( !m_queueIsEmpty )
        {
            job = m_jobQueue[ m_headIndex++ ];
            if ( m_headIndex == m_tailIndex )
            {
                m_queueIsEmpty = true;
            }
        }
        unlockQueue();
        return job;
    }
 };
 // Per-worker bookkeeping block, created by WorkerThreadAllocFunc.
 struct WorkerThreadLocalStorage
 {
    int threadId;                     // id passed to IJob::executeJob(); assigned as worker index + 1
    WorkerThreadStatus::Type status;  // last state recorded by the worker (or kSleeping from init)
    int numJobsFinished;              // incremented by the worker after each job; reset by the scheduler
    btSpinMutex m_mutex;              // held by the worker while it drains the queue and by readers of numJobsFinished
 };
 // Body run by a worker thread each time it is started.
 //   userPtr  - the JobContext shared with the scheduler
 //   lsMemory - this worker's WorkerThreadLocalStorage
 // Loops: drain the job queue, then spin while the queue is empty until either new jobs
 // show up, m_workersShouldSleep is set, or the queue has stayed empty (with no jobs
 // announced) for longer than m_coolDownTime. Marks itself kSleeping before returning.
 static void WorkerThreadFunc( void* userPtr, void* lsMemory )
 {
    BT_PROFILE( "WorkerThreadFunc" );
    WorkerThreadLocalStorage* localStorage = (WorkerThreadLocalStorage*) lsMemory;
    JobContext* jobContext = (JobContext*) userPtr;
    bool shouldSleep = false;
    while (! shouldSleep)
    {
        // do work
        // the per-worker mutex is held for the whole drain, so a reader of numJobsFinished
        // blocks until this worker has no job in progress
        localStorage->m_mutex.lock();
        while ( IJob* job = jobContext->consumeJob() )
        {
            localStorage->status = WorkerThreadStatus::kWorking;
            job->executeJob( localStorage->threadId );
            localStorage->numJobsFinished++;
        }
        localStorage->status = WorkerThreadStatus::kWaitingForWork;
        localStorage->m_mutex.unlock();
        // start of the idle period, used for the cooldown timeout below
        unsigned long long int clockStart = jobContext->m_clock.getTimeMicroseconds();
        // while queue is empty,
        while (jobContext->m_queueIsEmpty)
        {
            // todo: spin wait a bit to avoid hammering the empty queue
            btSpinPause();
            // an explicit sleep request wins over everything else
            if ( jobContext->m_workersShouldSleep )
            {
                shouldSleep = true;
                break;
            }
            // if jobs are incoming,
            if (jobContext->m_workersShouldCheckQueue)
            {
                clockStart = jobContext->m_clock.getTimeMicroseconds(); // reset clock
            }
            else
            {
                // if no jobs incoming and queue has been empty for the cooldown time, sleep
                unsigned long long int timeElapsed = jobContext->m_clock.getTimeMicroseconds() - clockStart;
                if (timeElapsed > jobContext->m_coolDownTime)
                {
                    shouldSleep = true;
                    break;
                }
            }
        }
        // leaving the inner loop without shouldSleep means the queue is no longer empty:
        // go around and drain it again
    }
    // go idle
    localStorage->m_mutex.lock();
    localStorage->status = WorkerThreadStatus::kSleeping;
    localStorage->m_mutex.unlock();
 }
 static void* WorkerThreadAllocFunc()
 {
    return new WorkerThreadLocalStorage;
 }
 class btTaskSchedulerDefault : public btITaskScheduler
 {
    JobContext m_jobContext;
    btThreadSupportInterface* m_threadSupport;
    btAlignedObjectArray<char> m_jobMem;
    btAlignedObjectArray<char> m_threadLocalMem;
    btSpinMutex m_antiNestingLock;  // prevent nested parallel-for
    int m_numThreads;
    int m_numWorkerThreads;
    int m_maxNumThreads;
    int m_numJobs;
 public:
    btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
    {
        m_threadSupport = NULL;
    }
    // Releases the worker threads and the queue lock by calling shutdown().
    // Called from a destructor, so this always runs this class's own shutdown().
    virtual ~btTaskSchedulerDefault()
    {
        shutdown();
    }
    void init()
    {
        btThreadSupportInterface::ConstructionInfo constructionInfo( "TaskScheduler", WorkerThreadFunc, WorkerThreadAllocFunc );
        m_threadSupport = btThreadSupportInterface::create( constructionInfo );
        m_numWorkerThreads = m_threadSupport->getNumWorkerThreads();
        m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1;
        m_numThreads = m_maxNumThreads;
        m_jobContext.m_queueLock = m_threadSupport->createCriticalSection();
        for ( int i = 0; i < m_numWorkerThreads; i++ )
        {
            WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
            btAssert( storage );
            storage->threadId = i + 1;  // workers start at 1
            storage->status = WorkerThreadStatus::kSleeping;
        }
        setWorkersActive( false ); // no work for them yet
        setNumThreads( m_threadSupport->getCacheFriendlyNumThreads() );
    }
    // Stops the workers and releases the queue lock and the thread support object.
    // Safe to call when init() was never run and safe to call more than once: the
    // destructor always calls it, so an explicit shutdown() followed by destruction
    // must not touch the already-released thread support.
    virtual void shutdown()
    {
        if ( m_threadSupport == NULL )
        {
            // nothing was created, or everything has been released already
            return;
        }
        setWorkersActive( false );
        waitForWorkersToSleep();
        m_threadSupport->deleteCriticalSection( m_jobContext.m_queueLock );
        m_jobContext.m_queueLock = NULL;
        delete m_threadSupport;
        m_threadSupport = NULL;
    }
    // Sets the flag workers poll while idle: true means jobs are incoming, so an idle
    // worker keeps resetting its cooldown clock instead of timing out.
    void setWorkersActive( bool active )
    {
        m_jobContext.m_workersShouldCheckQueue = active;
    }
    // Returns the upper bound for setNumThreads(): worker thread count + 1, as set by init().
    virtual int getMaxNumThreads() const BT_OVERRIDE
    {
        return m_maxNumThreads;
    }
    // Returns the thread count currently in use (main thread included).
    virtual int getNumThreads() const BT_OVERRIDE
    {
        return m_numThreads;
    }
    // Sets the number of threads to use, clamped to [1, m_maxNumThreads]; the number of
    // workers is always one less because the main thread takes part as well.
    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
    {
        int clamped = numThreads;
        if ( clamped > m_maxNumThreads )
        {
            clamped = m_maxNumThreads;
        }
        if ( clamped < 1 )
        {
            clamped = 1;
        }
        m_numThreads = clamped;
        m_numWorkerThreads = clamped - 1;
    }
    // Runs jobs on the calling thread (as thread id 0) until the queue is empty, then
    // spins until the workers' finished-job counts plus its own add up to m_numJobs.
    void waitJobs()
    {
        BT_PROFILE( "waitJobs" );
        // have the main thread work until the job queue is empty
        int numMainThreadJobsFinished = 0;
        while ( IJob* job = m_jobContext.consumeJob() )
        {
            job->executeJob( 0 );
            numMainThreadJobsFinished++;
        }
        // done with jobs for now, tell workers to rest
        setWorkersActive( false );
        unsigned long long int clockStart = m_jobContext.m_clock.getTimeMicroseconds();
        // wait for workers to finish any jobs in progress
        while ( true )
        {
            int numWorkerJobsFinished = 0;
            for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
            {
                WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
                // the worker holds this mutex while it drains the queue, so taking it
                // here waits for that worker's jobs in progress
                storage->m_mutex.lock();
                numWorkerJobsFinished += storage->numJobsFinished;
                storage->m_mutex.unlock();
            }
            if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs)
            {
                break;
            }
            unsigned long long int timeElapsed = m_jobContext.m_clock.getTimeMicroseconds() - clockStart;
            // NOTE(review): the assert trips after 1000 us but the bail-out below only
            // happens after 100000 us, and on that timeout the loop exits although the
            // finished count never reached m_numJobs -- confirm both limits are intended
            btAssert(timeElapsed < 1000);
            if (timeElapsed > 100000)
            {
                break;
            }
            btSpinPause();
        }
    }
    // Starts sleeping workers until at least min(numWorkersToWake, m_numWorkerThreads)
    // workers are counted as active. Expects setWorkersActive(true) to have been called.
    void wakeWorkers(int numWorkersToWake)
    {
        BT_PROFILE( "wakeWorkers" );
        btAssert( m_jobContext.m_workersShouldCheckQueue );
        int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads);
        // first pass: count the workers that are not asleep (status is read without the worker's mutex)
        int numActiveWorkers = 0;
        for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
        {
            // note this count of active workers is not necessarily totally reliable, because a worker thread could be
            // just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare.
            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
            if (storage->status != WorkerThreadStatus::kSleeping)
            {
                numActiveWorkers++;
            }
        }
        // second pass: start sleeping workers until the desired count is reached
        for ( int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker )
        {
            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
            if (storage->status == WorkerThreadStatus::kSleeping)
            {
                // the job context is handed to the worker as its task argument
                m_threadSupport->runTask( iWorker, &m_jobContext );
                numActiveWorkers++;
            }
        }
    }
    // Requests that all workers sleep, blocks in waitForAllTasks(), and then (in builds
    // where btAssert is active) checks that every worker in use reports kSleeping.
    void waitForWorkersToSleep()
    {
        BT_PROFILE( "waitForWorkersToSleep" );
        m_jobContext.m_workersShouldSleep = true;
        m_threadSupport->waitForAllTasks();
        for ( int i = 0; i < m_numWorkerThreads; i++ )
        {
            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory(i) );
            btAssert( storage );
            btAssert( storage->status == WorkerThreadStatus::kSleeping );
        }
    }
    // Non-blocking: only raises the sleep flag that idle workers poll; it does not wait
    // for the workers to actually go to sleep.
    virtual void sleepWorkerThreadsHint() BT_OVERRIDE
    {
        BT_PROFILE( "sleepWorkerThreadsHint" );
        // hint the task scheduler that we may not be using these threads for a little while
        m_jobContext.m_workersShouldSleep = true;
    }
    void prepareWorkerThreads()
    {
        for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
        {
            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
            storage->m_mutex.lock();
            storage->numJobsFinished = 0;
            storage->m_mutex.unlock();
        }
        m_jobContext.m_workersShouldSleep = false;
        setWorkersActive( true );
    }
    /// Run body.forLoop over [iBegin, iEnd), split into jobs of at most grainSize iterations.
    /// When the range is no larger than one grain, there are no worker threads, or the anti-nesting
    /// lock cannot be taken, the whole range is executed with a single forLoop call on the calling thread.
    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
    {
        BT_PROFILE( "parallelFor_ThreadSupport" );
        btAssert( iEnd >= iBegin );
        btAssert( grainSize >= 1 );
        const int numIterations = iEnd - iBegin;
        // tryLock() is evaluated last so the lock is only taken when the work is actually going to be split
        const bool runInParallel = numIterations > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock();
        if ( !runInParallel )
        {
            BT_PROFILE( "parallelFor_mainThread" );
            // just run on main thread
            body.forLoop( iBegin, iEnd );
            return;
        }
        const int numJobs = ( numIterations + grainSize - 1 ) / grainSize;  // round up
        m_numJobs = numJobs;
        btAssert( numJobs >= 2 );  // need more than one job for multithreading
        // grow the raw job storage if it cannot hold all job objects
        const int bytesNeeded = int( sizeof( ParallelForJob ) ) * numJobs;
        if ( m_jobMem.size() < bytesNeeded )
        {
            m_jobMem.resize( bytesNeeded );
        }
        // grow the job queue if it cannot hold all job pointers
        if ( m_jobContext.m_jobQueue.capacity() < numJobs )
        {
            m_jobContext.m_jobQueue.reserve( numJobs );
        }
        m_jobContext.clearQueue();
        // prepare worker threads for incoming work
        prepareWorkerThreads();
        // construct each job in place inside the job storage and queue it
        ParallelForJob* jobStorage = reinterpret_cast<ParallelForJob*>( &m_jobMem[ 0 ] );
        int jobIndex = 0;
        for ( int first = iBegin; first < iEnd; first += grainSize, ++jobIndex )
        {
            btAssert( jobIndex < numJobs );
            const int last = btMin( first + grainSize, iEnd );
            ParallelForJob* job = new ( (void*) ( jobStorage + jobIndex ) ) ParallelForJob( first, last, body );  // placement new
            m_jobContext.submitJob( job );
        }
        // the calling thread takes part as well, hence one worker fewer than there are jobs
        wakeWorkers( numJobs - 1 );
        // put the main thread to work on emptying the job queue and then wait for all workers to finish
        waitJobs();
        m_antiNestingLock.unlock();
    }
    /// Evaluate body.sumLoop over [iBegin, iEnd) and return the total.
    /// The range is split into jobs of at most grainSize iterations; every job is handed the
    /// per-thread ThreadLocalSum array and the partial sums are added up on the calling thread.
    /// When the range is no larger than one grain, there are no worker threads, or the anti-nesting
    /// lock cannot be taken, the result is simply body.sumLoop( iBegin, iEnd ) on the calling thread.
    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
    {
        BT_PROFILE( "parallelSum_ThreadSupport" );
        btAssert( iEnd >= iBegin );
        btAssert( grainSize >= 1 );
        int iterationCount = iEnd - iBegin;
        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
        {
            typedef ParallelSumJob JobType;
            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;  // round up
            m_numJobs = jobCount;
            btAssert( jobCount >= 2 );  // need more than one job for multithreading
            int jobSize = sizeof( JobType );
            int jobBufSize = jobSize * jobCount;
            // make sure we have enough memory allocated to store jobs
            if ( jobBufSize > m_jobMem.size() )
            {
                m_jobMem.resize( jobBufSize );
            }
            // make sure job queue is big enough
            if ( jobCount > m_jobContext.m_jobQueue.capacity() )
            {
                m_jobContext.m_jobQueue.reserve( jobCount );
            }
            // make sure thread local area is big enough
            int threadLocalSize = m_numThreads * sizeof( ThreadLocalSum );
            if ( threadLocalSize > m_threadLocalMem.size() )
            {
                m_threadLocalMem.resize( threadLocalSize );
            }
            // initialize summation
            ThreadLocalSum* threadLocalSum = reinterpret_cast<ThreadLocalSum*>( &m_threadLocalMem[ 0 ] );
            for ( int iThread = 0; iThread < m_numThreads; ++iThread )
            {
                threadLocalSum[ iThread ].mSum = btScalar( 0 );
            }
            m_jobContext.clearQueue();
            // prepare worker threads for incoming work
            prepareWorkerThreads();
            // submit all of the jobs
            int iJob = 0;
            JobType* jobs = reinterpret_cast<JobType*>( &m_jobMem[ 0 ] );
            for ( int i = iBegin; i < iEnd; i += grainSize )
            {
                btAssert( iJob < jobCount );
                int iE = btMin( i + grainSize, iEnd );
                JobType& job = jobs[ iJob ];
                new ( (void*) &job ) ParallelSumJob( i, iE, body, threadLocalSum );  // placement new
                m_jobContext.submitJob( &job );
                iJob++;
            }
            // the calling thread takes part as well, hence one worker fewer than there are jobs
            wakeWorkers( jobCount - 1 );
            // put the main thread to work on emptying the job queue and then wait for all workers to finish
            waitJobs();
            // add up all the thread sums while the lock is still held: m_threadLocalMem is shared
            // scheduler state, and releasing the lock first would let another caller reset it
            btScalar sum = btScalar(0);
            for ( int iThread = 0; iThread < m_numThreads; ++iThread )
            {
                sum += threadLocalSum[ iThread ].mSum;
            }
            m_antiNestingLock.unlock();
            return sum;
        }
        else
        {
            BT_PROFILE( "parallelSum_mainThread" );
            // just run on main thread
            return body.sumLoop( iBegin, iEnd );
        }
    }
 };
 /// Allocate the built-in task scheduler with new, initialise it and return it to the caller.
 btITaskScheduler* btCreateDefaultTaskScheduler()
 {
    btTaskSchedulerDefault* scheduler = new btTaskSchedulerDefault();
    scheduler->init();
    return scheduler;
 }
 #else // #if BT_THREADSAFE
 /// Variant compiled when BT_THREADSAFE is off: there is no default task scheduler, so NULL is returned.
 btITaskScheduler* btCreateDefaultTaskScheduler()
 {
    return NULL;
 }
 #endif // #else // #if BT_THREADSAFE
--- a/src/LinearMath/TaskScheduler/btThreadSupportInterface.h
+++ b/src/LinearMath/TaskScheduler/btThreadSupportInterface.h
@@ -0,0 +1,75 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2018 Erwin Coumans  http://bulletphysics.com
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose,
 including commercial applications, and to alter it and redistribute it freely,
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #ifndef BT_THREAD_SUPPORT_INTERFACE_H
 #define BT_THREAD_SUPPORT_INTERFACE_H
 /// Abstract lock interface. Concrete objects are obtained from
 /// btThreadSupportInterface::createCriticalSection() and released with deleteCriticalSection().
 class btCriticalSection
 {
 public:
    btCriticalSection() {}
    virtual ~btCriticalSection() {}
    /// Acquire the lock.
    virtual void lock() = 0;
    /// Release the lock.
    virtual void unlock() = 0;
 };
 /// Platform abstraction over a fixed pool of worker threads.
 /// A platform-specific implementation is obtained through the static create() factory.
 class btThreadSupportInterface
 {
 public:
    virtual ~btThreadSupportInterface() {}
    virtual int getNumWorkerThreads() const = 0;  // number of worker threads (total number of logical processors - 1)
    virtual int getCacheFriendlyNumThreads() const = 0;  // the number of logical processors sharing a single L3 cache
    /// Hand userData to the worker with the given index and signal it to start.
    virtual void runTask( int threadIndex, void* userData ) = 0;
    /// Block until every task started with runTask() has reported completion.
    virtual void waitForAllTasks() = 0;
    /// Create a lock object; release it with deleteCriticalSection().
    virtual btCriticalSection* createCriticalSection() = 0;
    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) = 0;
    /// Per-worker local memory; this base implementation has none and returns NULL.
    virtual void* getThreadLocalMemory( int taskId ) { return NULL; }
    /// Function run by a worker for each task: receives the runTask() user data and the worker's local memory.
    typedef void( *ThreadFunc )( void* userPtr, void* lsMemory );
    /// Function that produces the local memory block of one worker.
    typedef void* ( *MemorySetupFunc )( );
    /// Parameters handed to create().
    struct ConstructionInfo
    {
        ConstructionInfo( const char* uniqueName,
            ThreadFunc userThreadFunc,
            MemorySetupFunc	lsMemoryFunc,
            int threadStackSize = 65535
        )
            :m_uniqueName( uniqueName ),
            m_userThreadFunc( userThreadFunc ),
            m_lsMemoryFunc( lsMemoryFunc ),
            m_threadStackSize( threadStackSize )
        {
        }
        const char*     m_uniqueName;       // name used by implementations to label per-thread objects
        ThreadFunc      m_userThreadFunc;   // function each worker runs per task
        MemorySetupFunc m_lsMemoryFunc;     // called once per worker to obtain its local memory
        int             m_threadStackSize;  // requested stack size for each worker thread
    };
    /// Factory; defined by the platform-specific implementation file.
    static btThreadSupportInterface* create( const ConstructionInfo& info );
 };
 #endif //BT_THREAD_SUPPORT_INTERFACE_H
--- a/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
+++ b/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
@@ -0,0 +1,369 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2018 Erwin Coumans  http://bulletphysics.com
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose,
 including commercial applications, and to alter it and redistribute it freely,
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #if BT_THREADSAFE && !defined( _WIN32 )
 #include "LinearMath/btScalar.h"
 #include "LinearMath/btAlignedObjectArray.h"
 #include "LinearMath/btThreads.h"
 #include "LinearMath/btMinMax.h"
 #include "btThreadSupportInterface.h"
 #include <stdio.h>
 #include <errno.h>
 #include <unistd.h>
 #ifndef _XOPEN_SOURCE
 #define _XOPEN_SOURCE 600 //for definition of pthread_barrier_t, see http://pages.cs.wisc.edu/~travitch/pthreads_primer.html
 #endif //_XOPEN_SOURCE
 #include <pthread.h>
 #include <semaphore.h>
 #include <unistd.h>   //for sysconf
 ///
 /// getNumHardwareThreads()
 ///
 ///
 /// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
 ///
 #if __cplusplus >= 201103L
 #include <thread>
 /// Number of hardware threads as reported by the C++11 standard library.
 int btGetNumHardwareThreads()
 {
    const unsigned int reported = std::thread::hardware_concurrency();
    return static_cast<int>( reported );
 }
 #else
 /// Pre-C++11 variant: number of processors currently online according to sysconf().
 int btGetNumHardwareThreads()
 {
    const long onlineProcessors = sysconf( _SC_NPROCESSORS_ONLN );
    return static_cast<int>( onlineProcessors );
 }
 #endif
 // btThreadSupportPosix implements btThreadSupportInterface with pthreads and POSIX semaphores: it starts/stops the worker threads and signals tasks to them
 class btThreadSupportPosix : public btThreadSupportInterface
 {
 public:
    // Per-worker bookkeeping record; a pointer to it is the argument of the worker's thread function.
    struct btThreadStatus
    {
        int m_taskId;
        int m_commandId;
        int m_status;  // 0 = idle, 1 = task started, 2 = task finished, 3 = thread exiting
        ThreadFunc m_userThreadFunc;
        void* m_userPtr; // user data for the current task; null asks the thread to exit
        void* m_lsMemory; // per-thread local memory, obtained from ConstructionInfo::m_lsMemoryFunc
        pthread_t thread;
        // each thread waits on this semaphore until it is signalled to start its work
        sem_t* startSemaphore;
        // this is a copy of the owner's m_mainSemaphore;
        // each thread signals it once it is finished with its work
        sem_t* m_mainSemaphore;
        unsigned long threadUsed;  // number of tasks this thread has run
    };
 private:
    typedef unsigned long long UINT64;
    btAlignedObjectArray<btThreadStatus> m_activeThreadStatus;
    // m_mainSemaphore is signalled by the workers; its count tells how many of them have finished their work
    sem_t* m_mainSemaphore;
    int m_numThreads;
    UINT64 m_startedThreadsMask;  // bit i is set while worker i has a started, not yet collected task
    void startThreads( const ConstructionInfo& threadInfo );
    void stopThreads();
    int waitForResponse();
 public:
    btThreadSupportPosix( const ConstructionInfo& threadConstructionInfo );
    virtual ~btThreadSupportPosix();
    virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
    // TODO: return the number of logical processors sharing the first L3 cache
    // (for now this reports all workers plus the main thread)
    virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return m_numThreads + 1; }
    virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE;
    virtual void waitForAllTasks() BT_OVERRIDE;
    virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE;
    // returns the local memory block of the worker with index taskId
    virtual void* getThreadLocalMemory( int taskId ) BT_OVERRIDE
    {
        return m_activeThreadStatus[ taskId ].m_lsMemory;
    }
 };
 // Report a non-zero return value of a pthread/semaphore call on stdout.
 // The argument is evaluated exactly once (it is normally the call itself, so evaluating it again
 // for the message would repeat the call), and the do/while(0) wrapper makes the macro behave as a
 // single statement, e.g. inside an unbraced if/else.
 #define checkPThreadFunction(returnValue) \
    do { \
        const int btPThreadResult_ = (returnValue); \
        if(0 != btPThreadResult_) { \
            printf("PThread problem at line %i in file %s: %i %d\n", __LINE__, __FILE__, btPThreadResult_, errno); \
        } \
    } while(0)
 // The number of threads (workers plus the main thread) should be equal to the number of available cores.
 // Todo: pin each worker to a single core (the Win32 implementation uses SetThreadIdealProcessor for this).
 /// Creates the worker threads right away; all members are set up by startThreads().
 btThreadSupportPosix::btThreadSupportPosix( const ConstructionInfo& threadConstructionInfo )
 {
    startThreads( threadConstructionInfo );
 }
 // cleanup: ask all worker threads to exit and join them
 /// Shuts the worker threads down via stopThreads().
 btThreadSupportPosix::~btThreadSupportPosix()
 {
    stopThreads();
 }
 #if (defined (__APPLE__))
 #define NAMED_SEMAPHORES
 #endif
 /// Create a semaphore with an initial count of 0.
 /// With NAMED_SEMAPHORES a named semaphore is opened, using a name built from baseName, the process
 /// id and a running counter; otherwise an unnamed semaphore is allocated with new and initialised
 /// with sem_init. Release the result with destroySem().
 static sem_t* createSem( const char* baseName )
 {
    static int semCount = 0;
 #ifdef NAMED_SEMAPHORES
    /// Named semaphore begin
    char name[ 32 ];
    // "%.8s" keeps at most 8 characters of baseName (the former "%8.s" printed 8 blanks instead)
    snprintf( name, 32, "/%.8s-%4.d-%4.4d", baseName, getpid(), semCount++ );
    sem_t* tempSem = sem_open( name, O_CREAT, 0600, 0 );
    if ( tempSem == reinterpret_cast<sem_t *>( SEM_FAILED ) )
    {
        exit( -1 );
    }
    // The name is only needed to open the semaphore. Unlinking it now means the system-wide name
    // is not left behind after destroySem() closes the handle; the semaphore itself stays usable
    // until then.
    sem_unlink( name );
    /// Named semaphore end
 #else
    sem_t* tempSem = new sem_t;
    checkPThreadFunction( sem_init( tempSem, 0, 0 ) );
 #endif
    return tempSem;
 }
 /// Release a semaphore obtained from createSem(): unnamed semaphores are destroyed and their
 /// storage deleted, named semaphores are closed.
 static void destroySem( sem_t* semaphore )
 {
 #ifndef NAMED_SEMAPHORES
    checkPThreadFunction( sem_destroy( semaphore ) );
    delete semaphore;
 #else
    checkPThreadFunction( sem_close( semaphore ) );
 #endif
 }
 /// Entry point of every worker thread; argument is the worker's btThreadStatus record.
 /// The thread sleeps on its start semaphore. Each time it is signalled it runs the user thread
 /// function on the current user pointer and posts the main semaphore; a null user pointer is the
 /// request to exit. Always returns NULL.
 static void *threadFunction( void *argument )
 {
    btThreadSupportPosix::btThreadStatus* status = ( btThreadSupportPosix::btThreadStatus* )argument;
    while ( 1 )
    {
        checkPThreadFunction( sem_wait( status->startSemaphore ) );
        void* userPtr = status->m_userPtr;
        if ( userPtr )
        {
            btAssert( status->m_status );
            status->m_userThreadFunc( userPtr, status->m_lsMemory );
            // finish all writes to the status record before the main thread is signalled
            status->threadUsed++;
            status->m_status = 2;
            checkPThreadFunction( sem_post( status->m_mainSemaphore ) );
        }
        else
        {
            //exit Thread
            status->m_status = 3;
            checkPThreadFunction( sem_post( status->m_mainSemaphore ) );
            printf( "Thread with taskId %i exiting\n", status->m_taskId );
            break;
        }
    }
    printf( "Thread TERMINATED\n" );
    // a pthread start routine must return a value; flowing off the end of a non-void function is undefined
    return NULL;
 }
 /// Hand userData to the worker with index threadIndex, mark it as started and signal its start semaphore.
 void btThreadSupportPosix::runTask( int threadIndex, void* userData )
 {
    // validate the index before it is used to access the status array
    btAssert( threadIndex >= 0 );
    btAssert( threadIndex < m_activeThreadStatus.size() );
    btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ];
    threadStatus.m_commandId = 1;
    threadStatus.m_status = 1;
    threadStatus.m_userPtr = userData;
    m_startedThreadsMask |= UINT64( 1 ) << threadIndex;
    // fire event to start new task
    checkPThreadFunction( sem_post( threadStatus.startSemaphore ) );
 }
 /// Block until a worker posts the main semaphore, then collect one finished worker: its status
 /// is reset to 0 and its bit is cleared from the started mask.
 /// Returns the index of that worker, or -1 if no worker reports the finished state.
 int btThreadSupportPosix::waitForResponse()
 {
    btAssert( m_activeThreadStatus.size() );
    // wait for any of the threads to finish
    checkPThreadFunction( sem_wait( m_mainSemaphore ) );
    // find the first thread which has finished (status 2)
    int last = -1;
    for ( int t = 0; t < m_activeThreadStatus.size(); ++t )
    {
        if ( 2 == m_activeThreadStatus[ t ].m_status )
        {
            last = t;
            break;
        }
    }
    // a signed index keeps this check meaningful, and it now runs before the index is used
    btAssert( last >= 0 );
    if ( last < 0 )
    {
        return -1;
    }
    btThreadStatus& threadStatus = m_activeThreadStatus[ last ];
    btAssert( threadStatus.m_status > 1 );
    threadStatus.m_status = 0;
    m_startedThreadsMask &= ~( UINT64( 1 ) << last );
    return last;
 }
 /// Collect finished workers until no started task is outstanding.
 void btThreadSupportPosix::waitForAllTasks()
 {
    // every waitForResponse() call clears the mask bit of one finished worker
    while ( m_startedThreadsMask != 0 )
    {
        waitForResponse();
    }
 }
 void btThreadSupportPosix::startThreads( const ConstructionInfo& threadConstructionInfo )
 {
    m_numThreads = btGetNumHardwareThreads() - 1;  // main thread exists already
    printf( "%s creating %i threads.\n", __FUNCTION__, m_numThreads );
    m_activeThreadStatus.resize( m_numThreads );
    m_startedThreadsMask = 0;
    m_mainSemaphore = createSem( "main" );
    //checkPThreadFunction(sem_wait(mainSemaphore));
    for ( int i = 0; i < m_numThreads; i++ )
    {
        printf( "starting thread %d\n", i );
        btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
        threadStatus.startSemaphore = createSem( "threadLocal" );
        checkPThreadFunction( pthread_create( &threadStatus.thread, NULL, &threadFunction, (void*) &threadStatus ) );
        threadStatus.m_userPtr = 0;
        threadStatus.m_taskId = i;
        threadStatus.m_commandId = 0;
        threadStatus.m_status = 0;
        threadStatus.m_mainSemaphore = m_mainSemaphore;
        threadStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
        threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
        threadStatus.threadUsed = 0;
        printf( "started thread %d \n", i );
    }
 }
 ///tell the task scheduler we are done with the SPU tasks
 void btThreadSupportPosix::stopThreads()
 {
    for ( size_t t = 0; t < size_t( m_activeThreadStatus.size() ); ++t )
    {
        btThreadStatus& threadStatus = m_activeThreadStatus[ t ];
        printf( "%s: Thread %i used: %ld\n", __FUNCTION__, int( t ), threadStatus.threadUsed );
        threadStatus.m_userPtr = 0;
        checkPThreadFunction( sem_post( threadStatus.startSemaphore ) );
        checkPThreadFunction( sem_wait( m_mainSemaphore ) );
        printf( "destroy semaphore\n" );
        destroySem( threadStatus.startSemaphore );
        printf( "semaphore destroyed\n" );
        checkPThreadFunction( pthread_join( threadStatus.thread, 0 ) );
    }
    printf( "destroy main semaphore\n" );
    destroySem( m_mainSemaphore );
    printf( "main semaphore destroyed\n" );
    m_activeThreadStatus.clear();
 }
 /// btCriticalSection implemented with a pthread mutex created with default attributes.
 class btCriticalSectionPosix : public btCriticalSection
 {
    pthread_mutex_t m_mutex;
 public:
    btCriticalSectionPosix()
    {
        pthread_mutex_init( &m_mutex, NULL );
    }
    virtual ~btCriticalSectionPosix()
    {
        pthread_mutex_destroy( &m_mutex );
    }
    // acquire the mutex
    virtual void lock()
    {
        pthread_mutex_lock( &m_mutex );
    }
    // release the mutex
    virtual void unlock()
    {
        pthread_mutex_unlock( &m_mutex );
    }
 };
 /// Allocates a btCriticalSectionPosix with new; release it with deleteCriticalSection().
 btCriticalSection* btThreadSupportPosix::createCriticalSection()
 {
    return new btCriticalSectionPosix();
 }
 /// Destroys a critical section created by createCriticalSection().
 void btThreadSupportPosix::deleteCriticalSection( btCriticalSection* cs )
 {
    delete cs;
 }
 /// Factory for non-Windows builds: returns a new btThreadSupportPosix.
 btThreadSupportInterface* btThreadSupportInterface::create( const ConstructionInfo& info )
 {
    return new btThreadSupportPosix( info );
 }
 #endif // BT_THREADSAFE && !defined( _WIN32 )
--- a/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
+++ b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp
@@ -0,0 +1,480 @@
 /*
 Bullet Continuous Collision Detection and Physics Library
 Copyright (c) 2003-2018 Erwin Coumans  http://bulletphysics.com
 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose,
 including commercial applications, and to alter it and redistribute it freely,
 subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 */
 #if defined( _WIN32 ) &&  BT_THREADSAFE
 #include "LinearMath/btScalar.h"
 #include "LinearMath/btMinMax.h"
 #include "LinearMath/btAlignedObjectArray.h"
 #include "LinearMath/btThreads.h"
 #include "btThreadSupportInterface.h"
 #include <windows.h>
 #include <stdio.h>
 /// Processor topology summary filled in by getProcessorInformation().
 struct btProcessorInfo
 {
    int numLogicalProcessors;
    int numCores;
    int numNumaNodes;
    int numL1Cache;
    int numL2Cache;
    int numL3Cache;
    int numPhysicalPackages;
    static const int maxNumTeamMasks = 32;
    int numTeamMasks;  // number of valid entries in processorTeamMasks
    // one processor mask per "team", i.e. per group of logical processors sharing an L3 cache
    UINT64 processorTeamMasks[ maxNumTeamMasks ];
 };
 /// Return the mask of the first team that contains processor procId, or 0 if no team contains it.
 UINT64 getProcessorTeamMask( const btProcessorInfo& procInfo, int procId )
 {
    const UINT64 procBit = UINT64( 1 ) << procId;
    int iTeam = 0;
    while ( iTeam < procInfo.numTeamMasks )
    {
        const UINT64 teamMask = procInfo.processorTeamMasks[ iTeam ];
        if ( teamMask & procBit )
        {
            return teamMask;
        }
        ++iTeam;
    }
    return 0;
 }
 int getProcessorTeamIndex( const btProcessorInfo& procInfo, int procId )
 {
    UINT64 procMask = UINT64( 1 ) << procId;
    for ( int i = 0; i < procInfo.numTeamMasks; ++i )
    {
        if ( procMask & procInfo.processorTeamMasks[ i ] )
        {
            return i;
        }
    }
    return -1;
 }
 /// Count the 1-bits in bits.
 int countSetBits( ULONG64 bits )
 {
    int numSet = 0;
    // examine the lowest bit, then shift it out, until nothing is left
    for ( ULONG64 rest = bits; rest != 0; rest >>= 1 )
    {
        numSet += ( rest & 1 ) ? 1 : 0;
    }
    return numSet;
 }
 typedef BOOL( WINAPI *Pfn_GetLogicalProcessorInformation )( PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD );
 /// Fill *procInfo with the processor topology reported by GetLogicalProcessorInformation
 /// (looked up dynamically in kernel32).
 /// *procInfo is zeroed first and stays all-zero when the API is missing, fails, or the result
 /// buffer cannot be allocated.
 void getProcessorInformation( btProcessorInfo* procInfo )
 {
    memset( procInfo, 0, sizeof( *procInfo ) );
    Pfn_GetLogicalProcessorInformation getLogicalProcInfo =
        (Pfn_GetLogicalProcessorInformation) GetProcAddress( GetModuleHandle( TEXT( "kernel32" ) ), "GetLogicalProcessorInformation" );
    if ( getLogicalProcInfo == NULL )
    {
        // no info
        return;
    }
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buf = NULL;
    DWORD bufSize = 0;
    // The first call (no buffer) fails with ERROR_INSUFFICIENT_BUFFER and stores the required size in
    // bufSize; retry with a buffer of that size. Any other failure ends the query instead of retrying forever.
    while ( !getLogicalProcInfo( buf, &bufSize ) )
    {
        const DWORD lastError = GetLastError();  // read before any other call can overwrite it
        free( buf );  // free( NULL ) is a no-op
        buf = NULL;
        if ( lastError != ERROR_INSUFFICIENT_BUFFER )
        {
            // unexpected failure: no info
            return;
        }
        buf = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc( bufSize );
        if ( buf == NULL )
        {
            // out of memory: no info
            return;
        }
    }
    int len = bufSize / sizeof( *buf );
    for ( int i = 0; i < len; ++i )
    {
        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION info = buf + i;
        switch ( info->Relationship )
        {
        case RelationNumaNode:
            procInfo->numNumaNodes++;
            break;
        case RelationProcessorCore:
            procInfo->numCores++;
            procInfo->numLogicalProcessors += countSetBits( info->ProcessorMask );
            break;
        case RelationCache:
            if ( info->Cache.Level == 1 )
            {
                procInfo->numL1Cache++;
            }
            else if ( info->Cache.Level == 2 )
            {
                procInfo->numL2Cache++;
            }
            else if ( info->Cache.Level == 3 )
            {
                procInfo->numL3Cache++;
                // processors that share L3 cache are considered to be on the same team
                // because they can more easily work together on the same data.
                // Large performance penalties will occur if 2 or more threads from different
                // teams attempt to frequently read and modify the same cache lines.
                //
                // On the AMD Ryzen 7 CPU for example, the 8 cores on the CPU are split into
                // 2 CCX units of 4 cores each. Each CCX has a separate L3 cache, so if both
                // CCXs are operating on the same data, many cycles will be spent keeping the
                // two caches coherent.
                if ( procInfo->numTeamMasks < btProcessorInfo::maxNumTeamMasks )
                {
                    procInfo->processorTeamMasks[ procInfo->numTeamMasks ] = info->ProcessorMask;
                    procInfo->numTeamMasks++;
                }
            }
            break;
        case RelationProcessorPackage:
            procInfo->numPhysicalPackages++;
            break;
        default:
            // other relationship kinds are not counted
            break;
        }
    }
    free( buf );
 }
 ///btThreadSupportWin32 implements btThreadSupportInterface with Win32 threads and events: it starts/stops the worker threads and signals tasks to them
 class btThreadSupportWin32 : public btThreadSupportInterface
 {
 public:
    // Per-worker bookkeeping record; a pointer to it is the parameter of the worker's thread function.
    struct btThreadStatus
    {
        int m_taskId;
        int m_commandId;
        int m_status;  // 0 = idle, 1 = task started, 2 = task finished, 3 = thread exiting
        ThreadFunc m_userThreadFunc;
        void* m_userPtr; // user data for the current task; null asks the thread to exit
        void* m_lsMemory; // per-thread local memory, obtained from ConstructionInfo::m_lsMemoryFunc
        void* m_threadHandle; // handle of the thread running win32threadStartFunc
        void* m_eventStartHandle;  // set by runTask()/stopThreads() to wake the worker
        char m_eventStartHandleName[ 32 ];
        void* m_eventCompleteHandle;  // set by the worker when a task is done or when it exits
        char m_eventCompleteHandleName[ 32 ];
    };
 private:
    btAlignedObjectArray<btThreadStatus> m_activeThreadStatus;
    btAlignedObjectArray<void*> m_completeHandles;  // the workers' complete-event handles, indexed by worker
    int m_numThreads;
    DWORD_PTR m_startedThreadMask;  // bit i is set while worker i has a started, not yet collected task
    btProcessorInfo m_processorInfo;
    void startThreads( const ConstructionInfo& threadInfo );
    void stopThreads();
    int waitForResponse();
 public:
    btThreadSupportWin32( const ConstructionInfo& threadConstructionInfo );
    virtual ~btThreadSupportWin32();
    virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
    // number of logical processors in the first team (processors sharing the first L3 cache that was reported)
    virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); }
    virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE;
    virtual void waitForAllTasks() BT_OVERRIDE;
    // returns the local memory block of the worker with index taskId
    virtual void* getThreadLocalMemory( int taskId ) BT_OVERRIDE
    {
        return m_activeThreadStatus[ taskId ].m_lsMemory;
    }
    virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE;
 };
 /// Creates the worker threads right away via startThreads().
 btThreadSupportWin32::btThreadSupportWin32( const ConstructionInfo & threadConstructionInfo )
 {
    startThreads( threadConstructionInfo );
 }
 /// Shuts the worker threads down via stopThreads().
 btThreadSupportWin32::~btThreadSupportWin32()
 {
    stopThreads();
 }
 /// Entry point of every worker thread; lpParam is the worker's btThreadStatus record.
 /// The thread waits on its start event. Each time it is signalled it runs the user thread function
 /// on the current user pointer and sets its complete event; a null user pointer is the request to exit.
 DWORD WINAPI win32threadStartFunc( LPVOID lpParam )
 {
    btThreadSupportWin32::btThreadStatus* status = ( btThreadSupportWin32::btThreadStatus* )lpParam;
    for ( ;; )
    {
        WaitForSingleObject( status->m_eventStartHandle, INFINITE );
        void* userPtr = status->m_userPtr;
        if ( !userPtr )
        {
            // exit request: report it, acknowledge through the complete event and leave the loop
            status->m_status = 3;
            printf( "Thread with taskId %i with handle %p exiting\n", status->m_taskId, status->m_threadHandle );
            SetEvent( status->m_eventCompleteHandle );
            break;
        }
        // regular task
        btAssert( status->m_status );
        status->m_userThreadFunc( userPtr, status->m_lsMemory );
        status->m_status = 2;
        SetEvent( status->m_eventCompleteHandle );
    }
    printf( "Thread TERMINATED\n" );
    return 0;
 }
 /// Hand userData to the worker with index threadIndex, mark it as started and set its start event.
 void btThreadSupportWin32::runTask( int threadIndex, void* userData )
 {
    // validate the index (the parameter is threadIndex; there is no taskId here) before it is used
    btAssert( threadIndex >= 0 );
    btAssert( threadIndex < m_activeThreadStatus.size() );
    btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ];
    threadStatus.m_commandId = 1;
    threadStatus.m_status = 1;
    threadStatus.m_userPtr = userData;
    m_startedThreadMask |= DWORD_PTR( 1 ) << threadIndex;
    ///fire event to start new task
    SetEvent( threadStatus.m_eventStartHandle );
 }
 /// Block until one of the workers' complete events is signalled, then collect that worker: its
 /// status is reset to 0 and its bit is cleared from the started mask.
 /// Returns the index of that worker, or -1 if the wait did not yield a valid worker index.
 int btThreadSupportWin32::waitForResponse()
 {
    btAssert( m_activeThreadStatus.size() );
    DWORD res = WaitForMultipleObjects( m_completeHandles.size(), &m_completeHandles[ 0 ], FALSE, INFINITE );
    btAssert( res != WAIT_FAILED );
    const int last = int( res - WAIT_OBJECT_0 );
    // check the index before it is used: a failed wait yields a value outside the array
    btAssert( last >= 0 );
    btAssert( last < m_activeThreadStatus.size() );
    if ( last < 0 || last >= m_activeThreadStatus.size() )
    {
        return -1;
    }
    btThreadStatus& threadStatus = m_activeThreadStatus[ last ];
    btAssert( threadStatus.m_threadHandle );
    btAssert( threadStatus.m_eventCompleteHandle );
    btAssert( threadStatus.m_status > 1 );
    threadStatus.m_status = 0;
    m_startedThreadMask &= ~( DWORD_PTR( 1 ) << last );
    return last;
 }
 /// Collect finished workers until no started task is outstanding.
 void btThreadSupportWin32::waitForAllTasks()
 {
    // every waitForResponse() call clears the mask bit of one finished worker
    while ( m_startedThreadMask != 0 )
    {
        waitForResponse();
    }
 }
 void btThreadSupportWin32::startThreads( const ConstructionInfo& threadConstructionInfo )
 {
    static int uniqueId = 0;
    uniqueId++;
    btProcessorInfo& procInfo = m_processorInfo;
    getProcessorInformation( &procInfo );
    DWORD_PTR dwProcessAffinityMask = 0;
    DWORD_PTR dwSystemAffinityMask = 0;
    if ( !GetProcessAffinityMask( GetCurrentProcess(), &dwProcessAffinityMask, &dwSystemAffinityMask ) )
    {
        dwProcessAffinityMask = 0;
    }
    ///The number of threads should be equal to the number of available cores - 1
    m_numThreads = btMin(procInfo.numLogicalProcessors, int(BT_MAX_THREAD_COUNT)) - 1; // cap to max thread count (-1 because main thread already exists)
    m_activeThreadStatus.resize( m_numThreads );
    m_completeHandles.resize( m_numThreads );
    m_startedThreadMask = 0;
    // set main thread affinity
    if ( DWORD_PTR mask = dwProcessAffinityMask & getProcessorTeamMask( procInfo, 0 ))
    {
        SetThreadAffinityMask( GetCurrentThread(), mask );
        SetThreadIdealProcessor( GetCurrentThread(), 0 );
    }
    for ( int i = 0; i < m_numThreads; i++ )
    {
        printf( "starting thread %d\n", i );
        btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
        LPSECURITY_ATTRIBUTES lpThreadAttributes = NULL;
        SIZE_T dwStackSize = threadConstructionInfo.m_threadStackSize;
        LPTHREAD_START_ROUTINE lpStartAddress = &win32threadStartFunc;
        LPVOID lpParameter = &threadStatus;
        DWORD dwCreationFlags = 0;
        LPDWORD lpThreadId = 0;
        threadStatus.m_userPtr = 0;
        sprintf( threadStatus.m_eventStartHandleName, "es%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i );
        threadStatus.m_eventStartHandle = CreateEventA( 0, false, false, threadStatus.m_eventStartHandleName );
        sprintf( threadStatus.m_eventCompleteHandleName, "ec%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i );
        threadStatus.m_eventCompleteHandle = CreateEventA( 0, false, false, threadStatus.m_eventCompleteHandleName );
        m_completeHandles[ i ] = threadStatus.m_eventCompleteHandle;
        HANDLE handle = CreateThread( lpThreadAttributes, dwStackSize, lpStartAddress, lpParameter, dwCreationFlags, lpThreadId );
        //SetThreadPriority( handle, THREAD_PRIORITY_HIGHEST );
        // highest priority -- can cause erratic performance when numThreads > numCores
        //                     we don't want worker threads to be higher priority than the main thread or the main thread could get
        //                     totally shut out and unable to tell the workers to stop
        //SetThreadPriority( handle, THREAD_PRIORITY_BELOW_NORMAL );
        {
            int processorId = i + 1;  // leave processor 0 for main thread
            DWORD_PTR teamMask = getProcessorTeamMask( procInfo, processorId );
            if ( teamMask )
            {
                // bind each thread to only execute on processors of it's assigned team
                //  - for single-socket Intel x86 CPUs this has no effect (only a single, shared L3 cache so there is only 1 team)
                //  - for multi-socket Intel this will keep threads from migrating from one socket to another
                //  - for AMD Ryzen this will keep threads from migrating from one CCX to another
                DWORD_PTR mask = teamMask & dwProcessAffinityMask;
                if ( mask )
                {
                    SetThreadAffinityMask( handle, mask );
                }
            }
            SetThreadIdealProcessor( handle, processorId );
        }
        threadStatus.m_taskId = i;
        threadStatus.m_commandId = 0;
        threadStatus.m_status = 0;
        threadStatus.m_threadHandle = handle;
        threadStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
        threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
        printf( "started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle );
    }
 }
 /// Shut down every worker thread: wait for a task that is still marked as started, release the
 /// worker's local memory, ask the thread to exit, wait for its acknowledgement and close its handles.
 void btThreadSupportWin32::stopThreads()
 {
    for ( int i = 0; i < m_activeThreadStatus.size(); i++ )
    {
        btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
        // a non-zero status means a task was started and its completion has not been collected yet
        if ( threadStatus.m_status > 0 )
        {
            WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE );
        }
        // NOTE(review): m_lsMemory is a void*, so this delete cannot run a destructor and is formally
        // undefined; presumably the block comes from m_lsMemoryFunc -- confirm how it is allocated.
        delete threadStatus.m_lsMemory;
        // a null user pointer is the exit request; wake the worker and wait until it acknowledges
        threadStatus.m_userPtr = 0;
        SetEvent( threadStatus.m_eventStartHandle );
        WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE );
        CloseHandle( threadStatus.m_eventCompleteHandle );
        CloseHandle( threadStatus.m_eventStartHandle );
        CloseHandle( threadStatus.m_threadHandle );
    }
    m_activeThreadStatus.clear();
    m_completeHandles.clear();
 }
 /// btCriticalSection implemented with a Win32 CRITICAL_SECTION.
 class btWin32CriticalSection : public btCriticalSection
 {
 private:
    CRITICAL_SECTION mCriticalSection;
 public:
    btWin32CriticalSection()
    {
        InitializeCriticalSection( &mCriticalSection );
    }
    ~btWin32CriticalSection()
    {
        DeleteCriticalSection( &mCriticalSection );
    }
    // enter (acquire) the critical section
    void lock()
    {
        EnterCriticalSection( &mCriticalSection );
    }
    // leave (release) the critical section
    void unlock()
    {
        LeaveCriticalSection( &mCriticalSection );
    }
 };
 /// Construct a btWin32CriticalSection inside 16-byte aligned memory from btAlignedAlloc.
 /// Release it with deleteCriticalSection().
 btCriticalSection* btThreadSupportWin32::createCriticalSection()
 {
    void* storage = btAlignedAlloc( sizeof( btWin32CriticalSection ), 16 );
    return new( storage ) btWin32CriticalSection();
 }
 /// Counterpart of createCriticalSection(): run the destructor explicitly (the object was
 /// constructed with placement new), then return the memory with btAlignedFree.
 void btThreadSupportWin32::deleteCriticalSection( btCriticalSection* criticalSection )
 {
    criticalSection->~btCriticalSection();
    btAlignedFree( criticalSection );
 }
 /// Factory for Windows builds: returns a new btThreadSupportWin32.
 btThreadSupportInterface* btThreadSupportInterface::create( const ConstructionInfo& info )
 {
    return new btThreadSupportWin32( info );
 }
 #endif //defined(_WIN32) && BT_THREADSAFE
--- a/src/LinearMath/btThreads.cpp
+++ b/src/LinearMath/btThreads.cpp
@@ -453,6 +453,33 @@ void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBod
 #endif// #if BT_THREADSAFE
 }
 /// Dispatches a summing loop over [iBegin, iEnd) and returns the total.
 /// In BT_THREADSAFE builds the work is forwarded to the task scheduler held in
 /// gBtTaskScheduler (asserted non-null). Otherwise an assertion is raised and the
 /// whole range is summed by a single call to body.sumLoop().
 btScalar btParallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body )
 {
 #if BT_THREADSAFE
 #if BT_DETECT_BAD_THREAD_INDEX
    if ( !btThreadsAreRunning() )
    {
        // clear out thread ids
        for ( int i = 0; i < BT_MAX_THREAD_COUNT; ++i )
        {
            gDebugThreadIds[ i ] = kInvalidThreadId;
        }
    }
 #endif // #if BT_DETECT_BAD_THREAD_INDEX
    btAssert( gBtTaskScheduler != NULL );  // call btSetTaskScheduler() with a valid task scheduler first!
    return gBtTaskScheduler->parallelSum( iBegin, iEnd, grainSize, body );
 #else // #if BT_THREADSAFE
    // non-parallel version of btParallelSum
    // (the message names btParallelSum so the failing call site can be identified)
    btAssert( !"called btParallelSum in non-threadsafe build. enable BT_THREADSAFE" );
    return body.sumLoop( iBegin, iEnd );
 #endif //#else // #if BT_THREADSAFE
 }
 ///
 /// btTaskSchedulerSequential -- non-threaded implementation of task scheduler
@@ -470,6 +497,11 @@ public:
        BT_PROFILE( "parallelFor_sequential" );
        body.forLoop( iBegin, iEnd );
    }
    // Sequential fallback: the whole range [iBegin, iEnd) is summed by one sumLoop() call
    // on the calling thread; grainSize is not used.
    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
    {
        BT_PROFILE( "parallelSum_sequential" );
        const btScalar total = body.sumLoop( iBegin, iEnd );
        return total;
    }
 };
@@ -514,11 +546,25 @@ public:
 #pragma omp parallel for schedule( static, 1 )
        for ( int i = iBegin; i < iEnd; i += grainSize )
        {
-            BT_PROFILE( "OpenMP_job" );
+            BT_PROFILE( "OpenMP_forJob" );
            body.forLoop( i, ( std::min )( i + grainSize, iEnd ) );
        }
        btPopThreadsAreRunning();
    }
    // Sums body.sumLoop() over [iBegin, iEnd) in chunks of grainSize using an OpenMP
    // parallel for with a '+' reduction on the accumulator, and returns the total.
    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
    {
        // profile zone named after this method so sum time is not attributed to parallelFor
        BT_PROFILE( "parallelSum_OpenMP" );
        btPushThreadsAreRunning();
        btScalar sum = btScalar( 0 );
 #pragma omp parallel for schedule( static, 1 ) reduction(+:sum)
        for ( int i = iBegin; i < iEnd; i += grainSize )
        {
            BT_PROFILE( "OpenMP_sumJob" );
            // the last chunk is clamped so it never runs past iEnd
            sum += body.sumLoop( i, ( std::min )( i + grainSize, iEnd ) );
        }
        btPopThreadsAreRunning();
        return sum;
    }
 };
 #endif // #if BT_USE_OPENMP && BT_THREADSAFE
@@ -571,22 +617,21 @@ public:
            btResetThreadIndexCounter();
        }
    }
-    struct BodyAdapter
+    struct ForBodyAdapter
     {
        // Callable handed to tbb::parallel_for by parallelFor(): each invocation forwards
        // one blocked_range [begin, end) to the wrapped body's forLoop().
         const btIParallelForBody* mBody;
         ForBodyAdapter( const btIParallelForBody* body ) : mBody( body ) {}
         void operator()( const tbb::blocked_range<int>& range ) const
         {
-            BT_PROFILE( "TBB_job" );
+            BT_PROFILE( "TBB_forJob" );
             mBody->forLoop( range.begin(), range.end() );
         }
     };
    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
    {
        BT_PROFILE( "parallelFor_TBB" );
-        // TBB dispatch
+        ForBodyAdapter tbbBody( &body );
        BodyAdapter tbbBody;
        tbbBody.mBody = &body;
        btPushThreadsAreRunning();
        tbb::parallel_for( tbb::blocked_range<int>( iBegin, iEnd, grainSize ),
            tbbBody,
@@ -594,6 +639,29 @@ public:
        );
        btPopThreadsAreRunning();
    }
    // Reduction body used with tbb::parallel_deterministic_reduce: every instance
    // accumulates the sumLoop() results of the ranges it is invoked on, and join()
    // adds another instance's partial sum into this one.
    struct SumBodyAdapter
    {
        const btIParallelSumBody* mBody;  // wrapped loop body
        btScalar mSum;                    // partial sum accumulated by this instance
        SumBodyAdapter( const btIParallelSumBody* body )
            : mBody( body )
            , mSum( btScalar( 0 ) )
        {
        }
        // splitting constructor: shares the wrapped body and starts from a zero sum
        SumBodyAdapter( const SumBodyAdapter& src, tbb::split )
            : mBody( src.mBody )
            , mSum( btScalar( 0 ) )
        {
        }
        void join( const SumBodyAdapter& src )
        {
            mSum += src.mSum;
        }
        void operator()( const tbb::blocked_range<int>& range )
        {
            BT_PROFILE( "TBB_sumJob" );
            mSum += mBody->sumLoop( range.begin(), range.end() );
        }
    };
    // Sums body.sumLoop() over [iBegin, iEnd) with tbb::parallel_deterministic_reduce,
    // using grainSize as the blocked_range grain size, and returns the reduced total.
    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
    {
        BT_PROFILE( "parallelSum_TBB" );
        SumBodyAdapter reducer( &body );
        const tbb::blocked_range<int> fullRange( iBegin, iEnd, grainSize );
        btPushThreadsAreRunning();
        tbb::parallel_deterministic_reduce( fullRange, reducer );
        btPopThreadsAreRunning();
        return reducer.mSum;
    }
 };
 #endif // #if BT_USE_TBB && BT_THREADSAFE
@@ -605,6 +673,7 @@ public:
 class btTaskSchedulerPPL : public btITaskScheduler
 {
    int m_numThreads;
    concurrency::combinable<btScalar> m_sum;  // for parallelSum
 public:
    btTaskSchedulerPPL() : btITaskScheduler( "PPL" )
    {
@@ -644,15 +713,16 @@ public:
            btResetThreadIndexCounter();
        }
    }
-    struct BodyAdapter
+    struct ForBodyAdapter
     {
        // Callable constructed by parallelFor() for the PPL dispatch: invoked with a start
        // index i, it runs the wrapped body's forLoop() on [i, min(i + mGrainSize, mIndexEnd)).
         const btIParallelForBody* mBody;
         int mGrainSize;
         int mIndexEnd;
         ForBodyAdapter( const btIParallelForBody* body, int grainSize, int end ) : mBody( body ), mGrainSize( grainSize ), mIndexEnd( end ) {}
         void operator()( int i ) const
         {
-            BT_PROFILE( "PPL_job" );
+            BT_PROFILE( "PPL_forJob" );
             mBody->forLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );
         }
     };
@@ -660,10 +730,7 @@ public:
    {
        BT_PROFILE( "parallelFor_PPL" );
        // PPL dispatch
-        BodyAdapter pplBody;
+        ForBodyAdapter pplBody( &body, grainSize, iEnd );
        pplBody.mBody = &body;
        pplBody.mGrainSize = grainSize;
        pplBody.mIndexEnd = iEnd;
        btPushThreadsAreRunning();
        // note: MSVC 2010 doesn't support partitioner args, so avoid them
        concurrency::parallel_for( iBegin,
@@ -673,6 +740,36 @@ public:
        );
        btPopThreadsAreRunning();
    }
    // Callable used by parallelSum() with concurrency::parallel_for: invoked with a start
    // index i, it adds the wrapped body's sumLoop() result for [i, min(i + mGrainSize, mIndexEnd))
    // to the combinable's local() value.
    struct SumBodyAdapter
    {
        const btIParallelSumBody* mBody;          // wrapped loop body
        concurrency::combinable<btScalar>* mSum;  // accumulator supplied by the caller (not owned)
        int mGrainSize;                           // number of indices covered per invocation
        int mIndexEnd;                            // exclusive upper bound of the whole range
        SumBodyAdapter( const btIParallelSumBody* body, concurrency::combinable<btScalar>* sum, int grainSize, int end )
            : mBody( body )
            , mSum( sum )
            , mGrainSize( grainSize )
            , mIndexEnd( end )
        {
        }
        void operator()( int i ) const
        {
            BT_PROFILE( "PPL_sumJob" );
            mSum->local() += mBody->sumLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );
        }
    };
    // binary combiner passed to combinable::combine(): plain addition
    static btScalar sumFunc( btScalar a, btScalar b )
    {
        return a + b;
    }
    // Sums body.sumLoop() over [iBegin, iEnd) with concurrency::parallel_for stepping by
    // grainSize. Partial sums are accumulated in the member combinable m_sum, which is
    // cleared first and combined with sumFunc after the loop.
    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
    {
        BT_PROFILE( "parallelSum_PPL" );
        m_sum.clear();
        SumBodyAdapter sumJob( &body, &m_sum, grainSize, iEnd );
        btPushThreadsAreRunning();
        // note: MSVC 2010 doesn't support partitioner args, so avoid them
        concurrency::parallel_for( iBegin, iEnd, grainSize, sumJob );
        btPopThreadsAreRunning();
        const btScalar total = m_sum.combine( sumFunc );
        return total;
    }
 };
 #endif // #if BT_USE_PPL && BT_THREADSAFE
--- a/src/LinearMath/btThreads.h
+++ b/src/LinearMath/btThreads.h
@@ -107,6 +107,17 @@ public:
    virtual void forLoop( int iBegin, int iEnd ) const = 0;
 };
 //
 // btIParallelSumBody -- interface for loop bodies whose per-range results are added together;
 //                       subclass it and implement sumLoop() to describe the work
 //
 class btIParallelSumBody
 {
 public:
    virtual ~btIParallelSumBody()
    {
    }
    // return the sum contributed by the indices in [iBegin, iEnd)
    virtual btScalar sumLoop( int iBegin, int iEnd ) const = 0;
 };
 //
 // btITaskScheduler -- subclass this to implement a task scheduler that can dispatch work to
 //                     worker threads
@@ -122,6 +133,8 @@ public:
    virtual int getNumThreads() const = 0;
    virtual void setNumThreads( int numThreads ) = 0;
    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) = 0;
    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) = 0;
    virtual void sleepWorkerThreadsHint() {}  // hint the task scheduler that we may not be using these threads for a little while
    // internal use only
    virtual void activate();
@@ -143,6 +156,9 @@ btITaskScheduler* btGetTaskScheduler();
 // get non-threaded task scheduler (always available)
 btITaskScheduler* btGetSequentialTaskScheduler();
 // create a default task scheduler (Win32 or pthreads based)
 btITaskScheduler* btCreateDefaultTaskScheduler();
 // get OpenMP task scheduler (if available, otherwise returns null)
 btITaskScheduler* btGetOpenMPTaskScheduler();
@@ -156,5 +172,9 @@ btITaskScheduler* btGetPPLTaskScheduler();
 //                 (iterations may be done out of order, so no dependencies are allowed)
 void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body );
 // btParallelSum -- call this to dispatch work like a for-loop, returns the sum of all iterations
 //                 (iterations may be done out of order, so no dependencies are allowed)
 //                 body.sumLoop() is invoked on sub-ranges of [iBegin, iEnd); in BT_THREADSAFE builds
 //                 the call (including grainSize) is forwarded to the current task scheduler's parallelSum()
 btScalar btParallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body );
 #endif