parallel solver: various changes

- threading: adding btSequentialImpulseConstraintSolverMt
- task scheduler: added parallelSum so that parallel solver can compute residuals
- CommonRigidBodyMTBase: add slider for solver least squares residual and allow multithreading without needing OpenMP, TBB, or PPL
- taskScheduler: don't wait for workers to sleep/signal at the end of each parallel block
- parallel solver: convertContacts split into an allocContactConstraints and setupContactConstraints stage, the latter of which is done in parallel
- parallel solver: rolling friction is now interleaved along with normal friction
- parallel solver: batchified split impulse solving + some cleanup
- parallel solver: sorting batches from largest to smallest
- parallel solver: added parallel batch creation
- parallel solver: added warmstartingWriteBackContacts func + other cleanup
- task scheduler: truncate low bits to preserve determinism with parallelSum
- parallel solver: reducing dynamic mem allocs and trying to parallelize more of the batch setup
- parallel solver: parallelize updating constraint batch ids for merging
- parallel solver: adding debug visualization
- task scheduler: make TBB task scheduler parallelSum deterministic
- parallel solver: split batch gen code into separate file; allow selection of batch gen method
- task scheduler: add sleepWorkerThreadsHint() at end of simulation
- parallel solver: added grain size per phase
- task Scheduler: fix for strange threading issue; also no need for main thread to wait for workers to sleep
- base constraint solver: break out joint setup into separate function for profiling/overriding
- parallel solver: allow different batching method for contacts vs joints
- base constraint solver: add convertJoint and convertBodies to make it possible to parallelize joint and body conversion
- parallel solver: convert joints and bodies in parallel now
- parallel solver: speed up batch creation with run-length encoding
- parallel solver: batch gen: run-length expansion in parallel; collect constraint info in parallel
- parallel solver: adding spatial grid batching method
- parallel solver: enhancements to spatial grid batching
- sequential solver: moving code for writing back into functions that derived classes can call
- parallel solver: do write back of bodies and joints in parallel
- parallel solver: removed all batching methods except for spatial grid (others were ineffective)
- parallel solver: added 2D or 3D grid batching options; and a bit of cleanup
- move btDefaultTaskScheduler into LinearMath project
2017-06-04 17:57:25 -07:00
parent 94bc897067
commit b8720f2161
25 changed files with 5236 additions and 767 deletions
--- a/examples/ExampleBrowser/CMakeLists.txt
+++ b/examples/ExampleBrowser/CMakeLists.txt
@@ -226,7 +226,6 @@ SET(BulletExampleBrowser_SRCS
 	../MultiThreading/b3PosixThreadSupport.cpp
 	../MultiThreading/b3Win32ThreadSupport.cpp
 	../MultiThreading/b3ThreadSupportInterface.cpp
-	../MultiThreading/btTaskScheduler.cpp
 	../RenderingExamples/TinyRendererSetup.cpp
 	../RenderingExamples/TimeSeriesCanvas.cpp
 	../RenderingExamples/TimeSeriesCanvas.h
--- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
+++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp
@@ -29,17 +29,17 @@ class btCollisionShape;
 #include "BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h"
 #include "BulletDynamics/Dynamics/btSimulationIslandManagerMt.h"  // for setSplitIslands()
 #include "BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h"
+#include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h"
 #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"
 #include "BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.h"
 #include "BulletDynamics/MLCPSolvers/btMLCPSolver.h"
 #include "BulletDynamics/MLCPSolvers/btSolveProjectedGaussSeidel.h"
 #include "BulletDynamics/MLCPSolvers/btDantzigSolver.h"
 #include "BulletDynamics/MLCPSolvers/btLemkeSolver.h"
-#include "../MultiThreading/btTaskScheduler.h"


 static int gNumIslands = 0;
-
+bool gAllowNestedParallelForLoops = false;

 class Profiler
 {
@@ -52,6 +52,10 @@ public:
        kRecordPredictUnconstrainedMotion,
        kRecordCreatePredictiveContacts,
        kRecordIntegrateTransforms,
+        kRecordSolverTotal,
+        kRecordSolverSetup,
+        kRecordSolverIterations,
+        kRecordSolverFinish,
        kRecordCount
    };

@@ -139,6 +143,41 @@ static void profileEndCallback( btDynamicsWorld *world, btScalar timeStep )
 }


+class MySequentialImpulseConstraintSolverMt : public btSequentialImpulseConstraintSolverMt
+{   // MySequentialImpulseConstraintSolverMt -- subclassed for profiling purposes
+    typedef btSequentialImpulseConstraintSolverMt ParentClass;
+public:
+    BT_DECLARE_ALIGNED_ALLOCATOR();
+	
+	MySequentialImpulseConstraintSolverMt() {}
+
+    // for profiling: every override below forwards its arguments unchanged to ParentClass and returns the parent's result
+	virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE
+    {
+        ProfileHelper prof(Profiler::kRecordSolverSetup); // presumably a scoped timer feeding the kRecordSolverSetup record -- confirm against ProfileHelper
+        btScalar ret = ParentClass::solveGroupCacheFriendlySetup(bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer );
+        return ret;
+    }
+    virtual btScalar solveGroupCacheFriendlyIterations( btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer ) BT_OVERRIDE
+    {
+        ProfileHelper prof(Profiler::kRecordSolverIterations); // tagged separately so drawScreenText can show "SolverIterations"
+        btScalar ret = ParentClass::solveGroupCacheFriendlyIterations(bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer );
+        return ret;
+    }
+    virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal) BT_OVERRIDE
+    {
+        ProfileHelper prof(Profiler::kRecordSolverFinish); // tagged separately so drawScreenText can show "SolverFinish"
+        btScalar ret = ParentClass::solveGroupCacheFriendlyFinish(bodies, numBodies, infoGlobal);
+        return ret;
+    }
+    virtual btScalar solveGroup(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifold, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btDispatcher* dispatcher) BT_OVERRIDE
+    {
+        ProfileHelper prof(Profiler::kRecordSolverTotal); // "SolverTotal"; drawScreenText indents the setup/iterations/finish entries beneath it
+        btScalar ret = ParentClass::solveGroup(bodies, numBodies, manifold, numManifolds, constraints, numConstraints, info, debugDrawer, dispatcher);
+        return ret;
+    }
+};
+
 ///
 /// MyCollisionDispatcher -- subclassed for profiling purposes
 ///
@@ -218,6 +257,8 @@ btConstraintSolver* createSolverByType( SolverType t )
    {
    case SOLVER_TYPE_SEQUENTIAL_IMPULSE:
        return new btSequentialImpulseConstraintSolver();
+    case SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT:
+        return new MySequentialImpulseConstraintSolverMt();
    case SOLVER_TYPE_NNCG:
        return new btNNCGConstraintSolver();
    case SOLVER_TYPE_MLCP_PGS:
@@ -253,7 +294,7 @@ public:
    {
        addTaskScheduler( btGetSequentialTaskScheduler() );
 #if BT_THREADSAFE
-        if ( btITaskScheduler* ts = createDefaultTaskScheduler() )
+        if ( btITaskScheduler* ts = btCreateDefaultTaskScheduler() )
        {
            m_allocatedTaskSchedulers.push_back( ts );
            addTaskScheduler( ts );
@@ -310,7 +351,7 @@ static bool gDisplayProfileInfo = true;
 static bool gMultithreadedWorld = false;
 static bool gDisplayProfileInfo = false;
 #endif
-static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
+static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT;
 static int gSolverMode = SOLVER_SIMD |
                        SOLVER_USE_WARMSTARTING |
                        // SOLVER_RANDMIZE_ORDER |
@@ -318,9 +359,11 @@ static int gSolverMode = SOLVER_SIMD |
                        // SOLVER_USE_2_FRICTION_DIRECTIONS |
                        0;
 static btScalar gSliderSolverIterations = 10.0f; // should be int
-
 static btScalar gSliderNumThreads = 1.0f;  // should be int
-
+static btScalar gSliderIslandBatchingThreshold = 0.0f; // should be int
+static btScalar gSliderMinBatchSize = btScalar(btSequentialImpulseConstraintSolverMt::s_minBatchSize); // should be int
+static btScalar gSliderMaxBatchSize = btScalar(btSequentialImpulseConstraintSolverMt::s_maxBatchSize); // should be int
+static btScalar gSliderLeastSquaresResidualThreshold = 0.0f;

 ////////////////////////////////////
 CommonRigidBodyMTBase::CommonRigidBodyMTBase( struct GUIHelperInterface* helper )
@@ -419,6 +462,23 @@ void setTaskSchedulerComboBoxCallback(int combobox, const char* item, void* user
 }


+void setBatchingMethodComboBoxCallback(int combobox, const char* item, void* userPointer)
+{   // combo box callback: selects the contact batching method whose display name equals 'item'; 'combobox' is unused
+#if BT_THREADSAFE
+    const char** items = static_cast<const char**>( userPointer ); // the display-name table passed as m_userPointer when the combo box is registered
+    for ( int i = 0; i < btBatchedConstraints::BATCHING_METHOD_COUNT; ++i )
+    {
+        if ( strcmp( item, items[ i ] ) == 0 )
+        {
+            // change the contact batching method (the table is indexed by BatchingMethod, so the matching index is the enum value)
+            btSequentialImpulseConstraintSolverMt::s_contactBatchingMethod = static_cast<btBatchedConstraints::BatchingMethod>( i );
+            break;
+        }
+    }
+#endif // #if BT_THREADSAFE
+}
+
+
 static void setThreadCountCallback(float val, void* userPtr)
 {
 #if BT_THREADSAFE
@@ -435,13 +495,43 @@ static void setSolverIterationCountCallback(float val, void* userPtr)
    }
 }

+static void setLargeIslandManifoldCountCallback( float val, void* userPtr )
+{   // slider callback for "IslandBatchThresh"; 'val' and 'userPtr' are unused, the value is read from the global the slider is bound to
+    btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching = int( gSliderIslandBatchingThreshold ); // float slider value truncated to int
+}
+
+static void setMinBatchSizeCallback( float val, void* userPtr )
+{   // slider callback for "Min batch size"; 'val' and 'userPtr' are unused, both slider globals are read directly
+    gSliderMaxBatchSize = (std::max)(gSliderMinBatchSize, gSliderMaxBatchSize); // raise max if necessary so that max >= min
+    btSequentialImpulseConstraintSolverMt::s_minBatchSize = int(gSliderMinBatchSize); // push both limits (truncated to int) to the solver's static settings
+    btSequentialImpulseConstraintSolverMt::s_maxBatchSize = int(gSliderMaxBatchSize);
+}
+
+static void setMaxBatchSizeCallback( float val, void* userPtr )
+{   // slider callback for "Max batch size"; 'val' and 'userPtr' are unused, both slider globals are read directly
+    gSliderMinBatchSize = (std::min)(gSliderMinBatchSize, gSliderMaxBatchSize); // lower min if necessary so that min <= max
+    btSequentialImpulseConstraintSolverMt::s_minBatchSize = int(gSliderMinBatchSize); // push both limits (truncated to int) to the solver's static settings
+    btSequentialImpulseConstraintSolverMt::s_maxBatchSize = int(gSliderMaxBatchSize);
+}
+
+static void setLeastSquaresResidualThresholdCallback( float val, void* userPtr )
+{   // slider callback for "Solver residual thresh"; 'val' is unused, userPtr is the slider's m_userPointer (set to m_dynamicsWorld at registration)
+    if (btDiscreteDynamicsWorld* world = reinterpret_cast<btDiscreteDynamicsWorld*>(userPtr)) // do nothing when no world pointer was supplied
+    {
+        world->getSolverInfo().m_leastSquaresResidualThreshold = gSliderLeastSquaresResidualThreshold; // copy the slider-bound global into the world's solver info
+    }
+}
+
 void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
 {
    gNumIslands = 0;
    m_solverType = gSolverType;
-#if BT_THREADSAFE && (BT_USE_OPENMP || BT_USE_PPL || BT_USE_TBB)
+#if BT_THREADSAFE
    btAssert( btGetTaskScheduler() != NULL );
-    m_multithreadCapable = true;
+    if (NULL != btGetTaskScheduler() && gTaskSchedulerMgr.getNumTaskSchedulers() > 1)
+    {
+        m_multithreadCapable = true;
+    }
 #endif
    if ( gMultithreadedWorld )
    {
@@ -486,7 +576,12 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()

        m_broadphase = new btDbvtBroadphase();

-        m_solver = createSolverByType( m_solverType );
+        SolverType solverType = m_solverType;
+        if ( solverType == SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT )
+        {
+            solverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE;
+        }
+        m_solver = createSolverByType( solverType );

        m_dynamicsWorld = new btDiscreteDynamicsWorld( m_dispatcher, m_broadphase, m_solver, m_collisionConfiguration );
    }
@@ -494,6 +589,7 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld()
    m_dynamicsWorld->setInternalTickCallback( profileEndCallback, NULL, false );
    m_dynamicsWorld->setGravity( btVector3( 0, -10, 0 ) );
    m_dynamicsWorld->getSolverInfo().m_solverMode = gSolverMode;
+   	m_dynamicsWorld->getSolverInfo().m_numIterations = btMax(1, int(gSliderSolverIterations));
    createDefaultParameters();
 }

@@ -504,16 +600,18 @@ void CommonRigidBodyMTBase::createDefaultParameters()
    {
        // create a button to toggle multithreaded world
        ButtonParams button( "Multithreaded world enable", 0, true );
-        button.m_initialState = gMultithreadedWorld;
-        button.m_userPointer = &gMultithreadedWorld;
+        bool* ptr = &gMultithreadedWorld;
+        button.m_initialState = *ptr;
+        button.m_userPointer = ptr;
        button.m_callback = boolPtrButtonCallback;
        m_guiHelper->getParameterInterface()->registerButtonParameter( button );
    }
    {
        // create a button to toggle profile printing
        ButtonParams button( "Display solver info", 0, true );
-        button.m_initialState = gDisplayProfileInfo;
-        button.m_userPointer = &gDisplayProfileInfo;
+        bool* ptr = &gDisplayProfileInfo;
+        button.m_initialState = *ptr;
+        button.m_userPointer = ptr;
        button.m_callback = boolPtrButtonCallback;
        m_guiHelper->getParameterInterface()->registerButtonParameter( button );
    }
@@ -544,6 +642,16 @@ void CommonRigidBodyMTBase::createDefaultParameters()
        slider.m_clampToIntegers = true;
        m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
    }
+    {
+        // a slider for the solver leastSquaresResidualThreshold (used to run fewer solver iterations when convergence is good)
+        SliderParams slider( "Solver residual thresh", &gSliderLeastSquaresResidualThreshold );
+        slider.m_minVal = 0.0f;
+        slider.m_maxVal = 0.25f;
+        slider.m_callback = setLeastSquaresResidualThresholdCallback;
+        slider.m_userPointer = m_dynamicsWorld;
+        slider.m_clampToIntegers = false;
+        m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
+    }
    {
        ButtonParams button( "Solver use SIMD", 0, true );
        button.m_buttonId = SOLVER_SIMD;
@@ -618,20 +726,86 @@ void CommonRigidBodyMTBase::createDefaultParameters()
            m_guiHelper->getParameterInterface()->registerComboBox( comboParams );
        }
        {
-            // create a slider to set the number of threads to use
-            int numThreads = btGetTaskScheduler()->getNumThreads();
            // if slider has not been set yet (by another demo),
            if ( gSliderNumThreads <= 1.0f )
            {
+                // default the slider to the task scheduler's current thread count
+                int numThreads = btGetTaskScheduler()->getNumThreads();
                gSliderNumThreads = float( numThreads );
            }
+            int maxNumThreads = btGetTaskScheduler()->getMaxNumThreads();
 			SliderParams slider("Thread count", &gSliderNumThreads);
 			slider.m_minVal = 1.0f;
-			slider.m_maxVal = float( BT_MAX_THREAD_COUNT );
+			slider.m_maxVal = float( maxNumThreads );
 			slider.m_callback = setThreadCountCallback;
            slider.m_clampToIntegers = true;
            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
        }
+        {
+            // a slider for the minimum number of contact manifolds an island must have to be batched (s_minimumContactManifoldsForBatching)
+            if (gSliderIslandBatchingThreshold < 1.0)
+            {
+                gSliderIslandBatchingThreshold = float(btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching);
+            }
+            SliderParams slider( "IslandBatchThresh", &gSliderIslandBatchingThreshold );
+            slider.m_minVal = 1.0f;
+            slider.m_maxVal = 2000.0f;
+            slider.m_callback = setLargeIslandManifoldCountCallback;
+            slider.m_userPointer = NULL;
+            slider.m_clampToIntegers = true;
+            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
+        }
+        {
+            // create a combo box for selecting the batching method
+            static const char* sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_COUNT ];
+            {
+                sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_2D ] = "Batching: 2D Grid";
+                sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_3D ] = "Batching: 3D Grid";
+            };
+            ComboBoxParams comboParams;
+            comboParams.m_userPointer = sBatchingMethodComboBoxItems;
+            comboParams.m_numItems = btBatchedConstraints::BATCHING_METHOD_COUNT;
+            comboParams.m_startItem = static_cast<int>(btSequentialImpulseConstraintSolverMt::s_contactBatchingMethod);
+            comboParams.m_items = sBatchingMethodComboBoxItems;
+            comboParams.m_callback = setBatchingMethodComboBoxCallback;
+            m_guiHelper->getParameterInterface()->registerComboBox( comboParams );
+        }
+        {
+            // a slider for the sequentialImpulseConstraintSolverMt min batch size (when batching)
+            SliderParams slider( "Min batch size", &gSliderMinBatchSize );
+            slider.m_minVal = 1.0f;
+            slider.m_maxVal = 1000.0f;
+            slider.m_callback = setMinBatchSizeCallback;
+            slider.m_userPointer = NULL;
+            slider.m_clampToIntegers = true;
+            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
+        }
+        {
+            // a slider for the sequentialImpulseConstraintSolverMt max batch size (when batching)
+            SliderParams slider( "Max batch size", &gSliderMaxBatchSize );
+            slider.m_minVal = 1.0f;
+            slider.m_maxVal = 1000.0f;
+            slider.m_callback = setMaxBatchSizeCallback;
+            slider.m_userPointer = NULL;
+            slider.m_clampToIntegers = true;
+            m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider );
+        }
+        {
+            // create a button to toggle debug drawing of batching visualization
+            ButtonParams button( "Visualize batching", 0, true );
+            bool* ptr = &btBatchedConstraints::s_debugDrawBatches;
+            button.m_initialState = *ptr;
+            button.m_userPointer = ptr;
+            button.m_callback = boolPtrButtonCallback;
+            m_guiHelper->getParameterInterface()->registerButtonParameter( button );
+        }
+        {
+            ButtonParams button( "Allow Nested ParallelFor", 0, true );
+            button.m_initialState = btSequentialImpulseConstraintSolverMt::s_allowNestedParallelForLoops;
+            button.m_userPointer = &btSequentialImpulseConstraintSolverMt::s_allowNestedParallelForLoops;
+            button.m_callback = boolPtrButtonCallback;
+            m_guiHelper->getParameterInterface()->registerButtonParameter( button );
+        }
 #endif // #if BT_THREADSAFE
    }
 }
@@ -643,6 +817,7 @@ void CommonRigidBodyMTBase::drawScreenText()
    int xCoord = 400;
    int yCoord = 30;
    int yStep = 30;
+    int indent = 30;
    if (m_solverType != gSolverType)
    {
        sprintf( msg, "restart example to change solver type" );
@@ -721,6 +896,34 @@ void CommonRigidBodyMTBase::drawScreenText()
            m_guiHelper->getAppInterface()->drawText( msg, xCoord, yCoord, 0.4f );
            yCoord += yStep;

+            sprintf( msg,
+                     "SolverTotal %5.3f ms",
+                     gProfiler.getAverageTime( Profiler::kRecordSolverTotal )*0.001f
+                     );
+            m_guiHelper->getAppInterface()->drawText( msg, xCoord, yCoord, 0.4f );
+            yCoord += yStep;
+
+            sprintf( msg,
+                     "SolverSetup %5.3f ms",
+                     gProfiler.getAverageTime( Profiler::kRecordSolverSetup )*0.001f
+                     );
+            m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f );
+            yCoord += yStep;
+
+            sprintf( msg,
+                     "SolverIterations %5.3f ms",
+                     gProfiler.getAverageTime( Profiler::kRecordSolverIterations )*0.001f
+                     );
+            m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f );
+            yCoord += yStep;
+
+            sprintf( msg,
+                     "SolverFinish %5.3f ms",
+                     gProfiler.getAverageTime( Profiler::kRecordSolverFinish )*0.001f
+                     );
+            m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f );
+            yCoord += yStep;
+
            sprintf( msg,
                     "PredictUnconstrainedMotion %5.3f ms",
                     gProfiler.getAverageTime( Profiler::kRecordPredictUnconstrainedMotion )*0.001f
--- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h
+++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h
@@ -14,6 +14,7 @@
 enum SolverType
 {
    SOLVER_TYPE_SEQUENTIAL_IMPULSE,
+    SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT,
    SOLVER_TYPE_NNCG,
    SOLVER_TYPE_MLCP_PGS,
    SOLVER_TYPE_MLCP_DANTZIG,
@@ -27,6 +28,7 @@ inline const char* getSolverTypeName( SolverType t )
    switch (t)
    {
    case SOLVER_TYPE_SEQUENTIAL_IMPULSE: return "SequentialImpulse";
+    case SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT: return "SequentialImpulseMt";
    case SOLVER_TYPE_NNCG: return "NNCG";
    case SOLVER_TYPE_MLCP_PGS: return "MLCP ProjectedGaussSeidel";
    case SOLVER_TYPE_MLCP_DANTZIG: return "MLCP Dantzig";
--- a/examples/MultiThreadedDemo/MultiThreadedDemo.cpp
+++ b/examples/MultiThreadedDemo/MultiThreadedDemo.cpp
@@ -25,10 +25,10 @@ subject to the following restrictions:



-static btScalar gSliderStackRows = 8.0f;
-static btScalar gSliderStackColumns = 6.0f;
-static btScalar gSliderStackHeight = 10.0f;
-static btScalar gSliderStackWidth = 1.0f;
+static btScalar gSliderStackRows = 1.0f;
+static btScalar gSliderStackColumns = 1.0f;
+static btScalar gSliderStackHeight = 15.0f;
+static btScalar gSliderStackWidth = 8.0f;
 static btScalar gSliderGroundHorizontalAmplitude = 0.0f;
 static btScalar gSliderGroundVerticalAmplitude = 0.0f;
 static btScalar gSliderGroundTilt = 0.0f;
@@ -75,6 +75,21 @@ public:
        btScalar tilt = gSliderGroundTilt * SIMD_2_PI / 360.0f;
        return btQuaternion( btVector3( 1.0f, 0.0f, 0.0f ), tilt );
    }
+    struct TestSumBody : public btIParallelSumBody
+    {   // body used by the (currently #if 0'd) btParallelSum test in stepSimulation
+        virtual btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+        {   // returns the sum of 1/i over the positive i in [iBegin, iEnd)
+            btScalar sum = 0.0f;
+            for (int i = iBegin; i < iEnd; ++i)
+            {
+                if (i > 0) // skip non-positive indices; in particular avoids dividing by zero at i == 0
+                {
+                    sum += 1.0f / btScalar(i);
+                }
+            }
+            return sum;
+        }
+    };
    virtual void stepSimulation( float deltaTime ) BT_OVERRIDE
    {
        if ( m_dynamicsWorld )
@@ -115,6 +130,14 @@ public:
            // always step by 1/60 for benchmarking
            m_dynamicsWorld->stepSimulation( 1.0f / 60.0f, 0 );
        }
+#if 0
+        {
+            // test parallelSum
+            TestSumBody testSumBody;
+            float testSum = btParallelSum( 1, 10000000, 10000, testSumBody );
+            printf( "sum = %f\n", testSum );
+        }
+#endif
    }

    virtual void initPhysics() BT_OVERRIDE;
--- a/examples/MultiThreading/btTaskScheduler.cpp
+++ b/examples/MultiThreading/btTaskScheduler.cpp
@@ -1,448 +0,0 @@
-
-#include "LinearMath/btTransform.h"
-#include "../Utils/b3Clock.h"
-#include "LinearMath/btAlignedObjectArray.h"
-#include "LinearMath/btThreads.h"
-#include "LinearMath/btQuickprof.h"
-#include <stdio.h>
-#include <algorithm>
-
-
-typedef void( *btThreadFunc )( void* userPtr, void* lsMemory );
-typedef void* ( *btThreadLocalStorageFunc )();
-
-#if BT_THREADSAFE
-
-#if defined( _WIN32 )
-
-#include "b3Win32ThreadSupport.h"
-
-b3ThreadSupportInterface* createThreadSupport( int numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName )
-{
-    b3Win32ThreadSupport::Win32ThreadConstructionInfo constructionInfo( uniqueName, threadFunc, localStoreFunc, numThreads );
-    //constructionInfo.m_priority = 0;  // highest priority (the default) -- can cause erratic performance when numThreads > numCores
-    //                                     we don't want worker threads to be higher priority than the main thread or the main thread could get
-    //                                     totally shut out and unable to tell the workers to stop
-    constructionInfo.m_priority = -1;  // normal priority
-    b3Win32ThreadSupport* threadSupport = new b3Win32ThreadSupport( constructionInfo );
-    return threadSupport;
-}
-
-#else // #if defined( _WIN32 )
-
-#include "b3PosixThreadSupport.h"
-
-b3ThreadSupportInterface* createThreadSupport( int numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName)
-{
-    b3PosixThreadSupport::ThreadConstructionInfo constructionInfo( uniqueName, threadFunc, localStoreFunc, numThreads );
-    b3ThreadSupportInterface* threadSupport = new b3PosixThreadSupport( constructionInfo );
-    return threadSupport;
-}
-
-#endif // #else // #if defined( _WIN32 )
-
-
-///
-/// getNumHardwareThreads()
-///
-///
-/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
-///
-#if __cplusplus >= 201103L
-
-#include <thread>
-
-int getNumHardwareThreads()
-{
-    return std::thread::hardware_concurrency();
-}
-
-#elif defined( _WIN32 )
-
-#define WIN32_LEAN_AND_MEAN
-
-#include <windows.h>
-
-int getNumHardwareThreads()
-{
-    // caps out at 32
-    SYSTEM_INFO info;
-    GetSystemInfo( &info );
-    return info.dwNumberOfProcessors;
-}
-
-#else
-
-int getNumHardwareThreads()
-{
-    return 0;  // don't know
-}
-
-#endif
-
-
-struct WorkerThreadStatus
-{
-    enum Type
-    {
-        kInvalid,
-        kWaitingForWork,
-        kWorking,
-        kSleeping,
-    };
-};
-
-
-struct IJob
-{
-    virtual void executeJob() = 0;
-};
-
-class ParallelForJob : public IJob
-{
-    const btIParallelForBody* mBody;
-    int mBegin;
-    int mEnd;
-
-public:
-    ParallelForJob()
-    {
-        mBody = NULL;
-        mBegin = 0;
-        mEnd = 0;
-    }
-    void init( int iBegin, int iEnd, const btIParallelForBody& body )
-    {
-        mBody = &body;
-        mBegin = iBegin;
-        mEnd = iEnd;
-    }
-    virtual void executeJob() BT_OVERRIDE
-    {
-        BT_PROFILE( "executeJob" );
-
-        // call the functor body to do the work
-        mBody->forLoop( mBegin, mEnd );
-    }
-};
-
-
-struct JobContext
-{
-    JobContext()
-    {
-        m_queueLock = NULL;
-        m_headIndex = 0;
-        m_tailIndex = 0;
-        m_workersShouldCheckQueue = false;
-        m_useSpinMutex = false;
-    }
-    b3CriticalSection* m_queueLock;
-    btSpinMutex m_mutex;
-    volatile bool m_workersShouldCheckQueue;
-
-    btAlignedObjectArray<IJob*> m_jobQueue;
-    bool m_queueIsEmpty;
-    int m_tailIndex;
-    int m_headIndex;
-    bool m_useSpinMutex;
-
-    void lockQueue()
-    {
-        if ( m_useSpinMutex )
-        {
-            m_mutex.lock();
-        }
-        else
-        {
-            m_queueLock->lock();
-        }
-    }
-    void unlockQueue()
-    {
-        if ( m_useSpinMutex )
-        {
-            m_mutex.unlock();
-        }
-        else
-        {
-            m_queueLock->unlock();
-        }
-    }
-    void clearQueue()
-    {
-        lockQueue();
-        m_headIndex = 0;
-        m_tailIndex = 0;
-        m_queueIsEmpty = true;
-        unlockQueue();
-        m_jobQueue.resizeNoInitialize( 0 );
-    }
-    void submitJob( IJob* job )
-    {
-        m_jobQueue.push_back( job );
-        lockQueue();
-        m_tailIndex++;
-        m_queueIsEmpty = false;
-        unlockQueue();
-    }
-    IJob* consumeJob()
-    {
-        if ( m_queueIsEmpty )
-        {
-            // lock free path. even if this is taken erroneously it isn't harmful
-            return NULL;
-        }
-        IJob* job = NULL;
-        lockQueue();
-        if ( !m_queueIsEmpty )
-        {
-            job = m_jobQueue[ m_headIndex++ ];
-            if ( m_headIndex == m_tailIndex )
-            {
-                m_queueIsEmpty = true;
-            }
-        }
-        unlockQueue();
-        return job;
-    }
-};
-
-
-struct WorkerThreadLocalStorage
-{
-    int threadId;
-    WorkerThreadStatus::Type status;
-};
-
-
-static void WorkerThreadFunc( void* userPtr, void* lsMemory )
-{
-    BT_PROFILE( "WorkerThreadFunc" );
-    WorkerThreadLocalStorage* localStorage = (WorkerThreadLocalStorage*) lsMemory;
-    localStorage->status = WorkerThreadStatus::kWaitingForWork;
-    //printf( "WorkerThreadFunc: worker %d start working\n", localStorage->threadId );
-
-    JobContext* jobContext = (JobContext*) userPtr;
-
-    while ( jobContext->m_workersShouldCheckQueue )
-    {
-        if ( IJob* job = jobContext->consumeJob() )
-        {
-            localStorage->status = WorkerThreadStatus::kWorking;
-            job->executeJob();
-            localStorage->status = WorkerThreadStatus::kWaitingForWork;
-        }
-        else
-        {
-            // todo: spin wait a bit to avoid hammering the empty queue
-        }
-    }
-
-    //printf( "WorkerThreadFunc stop working\n" );
-    localStorage->status = WorkerThreadStatus::kSleeping;
-    // go idle
-}
-
-
-static void* WorkerThreadAllocFunc()
-{
-    return new WorkerThreadLocalStorage;
-}
-
-
-
-class btTaskSchedulerDefault : public btITaskScheduler
-{
-    JobContext m_jobContext;
-    b3ThreadSupportInterface* m_threadSupport;
-    btAlignedObjectArray<ParallelForJob> m_jobs;
-    btSpinMutex m_antiNestingLock;  // prevent nested parallel-for
-    int m_numThreads;
-    int m_numWorkerThreads;
-    int m_numWorkersRunning;
-public:
-
-    btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
-    {
-        m_threadSupport = NULL;
-        m_numThreads = getNumHardwareThreads();
-        // if can't detect number of cores,
-        if ( m_numThreads == 0 )
-        {
-            // take a guess
-            m_numThreads = 4;
-        }
-        m_numWorkerThreads = m_numThreads - 1;
-        m_numWorkersRunning = 0;
-    }
-
-    virtual ~btTaskSchedulerDefault()
-    {
-        shutdown();
-    }
-
-    void init()
-    {
-        int maxNumWorkerThreads = BT_MAX_THREAD_COUNT - 1;
-        m_threadSupport = createThreadSupport( maxNumWorkerThreads, WorkerThreadFunc, WorkerThreadAllocFunc, "TaskScheduler" );
-        m_jobContext.m_queueLock = m_threadSupport->createCriticalSection();
-        for ( int i = 0; i < maxNumWorkerThreads; i++ )
-        {
-            WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
-            btAssert( storage );
-            storage->threadId = i;
-            storage->status = WorkerThreadStatus::kSleeping;
-        }
-        setWorkersActive( false ); // no work for them yet
-    }
-
-    virtual void shutdown()
-    {
-        setWorkersActive( false );
-        waitForWorkersToSleep();
-        m_threadSupport->deleteCriticalSection( m_jobContext.m_queueLock );
-        m_jobContext.m_queueLock = NULL;
-
-        delete m_threadSupport;
-        m_threadSupport = NULL;
-    }
-
-    void setWorkersActive( bool active )
-    {
-        m_jobContext.m_workersShouldCheckQueue = active;
-    }
-
-    virtual int getMaxNumThreads() const BT_OVERRIDE
-    {
-        return BT_MAX_THREAD_COUNT;
-    }
-
-    virtual int getNumThreads() const BT_OVERRIDE
-    {
-        return m_numThreads;
-    }
-
-    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
-    {
-        m_numThreads = btMax( btMin(numThreads, int(BT_MAX_THREAD_COUNT)), 1 );
-        m_numWorkerThreads = m_numThreads - 1;
-    }
-
-    void waitJobs()
-    {
-        BT_PROFILE( "waitJobs" );
-        // have the main thread work until the job queue is empty
-        for ( ;; )
-        {
-            if ( IJob* job = m_jobContext.consumeJob() )
-            {
-                job->executeJob();
-            }
-            else
-            {
-                break;
-            }
-        }
-        // done with jobs for now, tell workers to rest
-        setWorkersActive( false );
-        waitForWorkersToSleep();
-    }
-
-    void wakeWorkers()
-    {
-        BT_PROFILE( "wakeWorkers" );
-        btAssert( m_jobContext.m_workersShouldCheckQueue );
-        // tell each worker thread to start working
-        for ( int i = 0; i < m_numWorkerThreads; i++ )
-        {
-            m_threadSupport->runTask( B3_THREAD_SCHEDULE_TASK, &m_jobContext, i );
-            m_numWorkersRunning++;
-        }
-    }
-
-    void waitForWorkersToSleep()
-    {
-        BT_PROFILE( "waitForWorkersToSleep" );
-        while ( m_numWorkersRunning > 0 )
-        {
-            int iThread;
-            int threadStatus;
-            m_threadSupport->waitForResponse( &iThread, &threadStatus );  // wait for worker threads to finish working
-            m_numWorkersRunning--;
-        }
-        //m_threadSupport->waitForAllTasksToComplete();
-        for ( int i = 0; i < m_numWorkerThreads; i++ )
-        {
-            //m_threadSupport->waitForTaskCompleted( i );
-            WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
-            btAssert( storage );
-            btAssert( storage->status == WorkerThreadStatus::kSleeping );
-        }
-    }
-
-    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
-    {
-        BT_PROFILE( "parallelFor_ThreadSupport" );
-        btAssert( iEnd >= iBegin );
-        btAssert( grainSize >= 1 );
-        int iterationCount = iEnd - iBegin;
-        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
-        {
-            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;
-            btAssert( jobCount >= 2 );  // need more than one job for multithreading
-            if ( jobCount > m_jobs.size() )
-            {
-                m_jobs.resize( jobCount );
-            }
-            if ( jobCount > m_jobContext.m_jobQueue.capacity() )
-            {
-                m_jobContext.m_jobQueue.reserve( jobCount );
-            }
-
-            m_jobContext.clearQueue();
-            // prepare worker threads for incoming work
-            setWorkersActive( true );
-            wakeWorkers();
-            // submit all of the jobs
-            int iJob = 0;
-            for ( int i = iBegin; i < iEnd; i += grainSize )
-            {
-                btAssert( iJob < jobCount );
-                int iE = btMin( i + grainSize, iEnd );
-                ParallelForJob& job = m_jobs[ iJob ];
-                job.init( i, iE, body );
-                m_jobContext.submitJob( &job );
-                iJob++;
-            }
-
-            // put the main thread to work on emptying the job queue and then wait for all workers to finish
-            waitJobs();
-            m_antiNestingLock.unlock();
-        }
-        else
-        {
-            BT_PROFILE( "parallelFor_mainThread" );
-            // just run on main thread
-            body.forLoop( iBegin, iEnd );
-        }
-    }
-};
-
-
-
-btITaskScheduler* createDefaultTaskScheduler()
-{
-    btTaskSchedulerDefault* ts = new btTaskSchedulerDefault();
-    ts->init();
-    return ts;
-}
-
-#else // #if BT_THREADSAFE
-
-btITaskScheduler* createDefaultTaskScheduler()
-{
-    return NULL;
-}
-
-#endif // #else // #if BT_THREADSAFE
--- a/examples/MultiThreading/btTaskScheduler.h
+++ b/examples/MultiThreading/btTaskScheduler.h
@@ -1,26 +0,0 @@
-/*
-Copyright (c) 2003-2014 Erwin Coumans  http://bullet.googlecode.com
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose,
-including commercial applications, and to alter it and redistribute it freely,
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-
-#ifndef BT_TASK_SCHEDULER_H
-#define BT_TASK_SCHEDULER_H
-
-
-class btITaskScheduler;
-
-btITaskScheduler* createDefaultTaskScheduler();
-
-
-#endif // BT_TASK_SCHEDULER_H