Merge pull request #1168 from lunkhound/pr-fix-thread-index

fix various multithreading issues with thread indexes
2017-06-05 13:27:05 -07:00
parent 0d7232c661 6bf3d4e08e
commit be9384a01b
3 changed files with 461 additions and 262 deletions
--- a/src/LinearMath/btQuickprof.cpp
+++ b/src/LinearMath/btQuickprof.cpp
@@ -14,7 +14,7 @@
 // Ogre (www.ogre3d.org).

 #include "btQuickprof.h"
-
+#include "btThreads.h"



@@ -685,6 +685,9 @@ void	CProfileManager::dumpAll()

 unsigned int btQuickprofGetCurrentThreadIndex2()
 {
+#if BT_THREADSAFE
+    return btGetCurrentThreadIndex();
+#else // #if BT_THREADSAFE
 	const unsigned int kNullIndex = ~0U;
 #ifdef _WIN32
    #if defined(__MINGW32__) || defined(__MINGW64__)
@@ -717,6 +720,7 @@ unsigned int btQuickprofGetCurrentThreadIndex2()
 		sThreadIndex = gThreadCounter++;
 	}
 	return sThreadIndex;
+#endif // #else // #if BT_THREADSAFE
 }

 void	btEnterProfileZoneDefault(const char* name)
--- a/src/LinearMath/btThreads.cpp
+++ b/src/LinearMath/btThreads.cpp
@@ -17,26 +17,25 @@ subject to the following restrictions:
 #include "btQuickprof.h"
 #include <algorithm>  // for min and max

-#if BT_THREADSAFE

-#if BT_USE_OPENMP
+#if BT_USE_OPENMP && BT_THREADSAFE

 #include <omp.h>

-#endif // #if BT_USE_OPENMP
+#endif // #if BT_USE_OPENMP && BT_THREADSAFE


-#if BT_USE_PPL
+#if BT_USE_PPL && BT_THREADSAFE

 // use Microsoft Parallel Patterns Library (installed with Visual Studio 2010 and later)
 #include <ppl.h>  // if you get a compile error here, check whether your version of Visual Studio includes PPL
 // Visual Studio 2010 and later should come with it
 #include <concrtrm.h>  // for GetProcessorCount()

-#endif // #if BT_USE_PPL
+#endif // #if BT_USE_PPL && BT_THREADSAFE


-#if BT_USE_TBB
+#if BT_USE_TBB && BT_THREADSAFE

 // use Intel Threading Building Blocks for thread management
 #define __TBB_NO_IMPLICIT_LINKAGE 1
@@ -45,224 +44,10 @@ subject to the following restrictions:
 #include <tbb/parallel_for.h>
 #include <tbb/blocked_range.h>

-#endif // #if BT_USE_TBB
-
-
-static btITaskScheduler* gBtTaskScheduler;
-static int gThreadsRunningCounter = 0;  // useful for detecting if we are trying to do nested parallel-for calls
-static btSpinMutex gThreadsRunningCounterMutex;
-
-void btPushThreadsAreRunning()
-{
-    gThreadsRunningCounterMutex.lock();
-    gThreadsRunningCounter++;
-    gThreadsRunningCounterMutex.unlock();
-}
-
-void btPopThreadsAreRunning()
-{
-    gThreadsRunningCounterMutex.lock();
-    gThreadsRunningCounter--;
-    gThreadsRunningCounterMutex.unlock();
-}
-
-bool btThreadsAreRunning()
-{
-    return gThreadsRunningCounter != 0;
-}
-
-
-void btSetTaskScheduler( btITaskScheduler* ts )
-{
-    gBtTaskScheduler = ts;
-}
-
-btITaskScheduler* btGetTaskScheduler()
-{
-    return gBtTaskScheduler;
-}
-
-void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body )
-{
-    gBtTaskScheduler->parallelFor( iBegin, iEnd, grainSize, body );
-}
-
-
-#if BT_USE_OPENMP
-///
-/// btTaskSchedulerOpenMP -- OpenMP task scheduler implementation
-///
-class btTaskSchedulerOpenMP : public btITaskScheduler
-{
-    int m_numThreads;
-public:
-    btTaskSchedulerOpenMP() : btITaskScheduler( "OpenMP" )
-    {
-        m_numThreads = 0;
-    }
-    virtual int getMaxNumThreads() const BT_OVERRIDE
-    {
-        return omp_get_max_threads();
-    }
-    virtual int getNumThreads() const BT_OVERRIDE
-    {
-        return m_numThreads;
-    }
-    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
-    {
-        m_numThreads = ( std::max )( 1, numThreads );
-        omp_set_num_threads( m_numThreads );
-    }
-    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
-    {
-        BT_PROFILE( "parallelFor_OpenMP" );
-        btPushThreadsAreRunning();
-#pragma omp parallel for schedule( static, 1 )
-        for ( int i = iBegin; i < iEnd; i += grainSize )
-        {
-            BT_PROFILE( "OpenMP_job" );
-            body.forLoop( i, ( std::min )( i + grainSize, iEnd ) );
-        }
-        btPopThreadsAreRunning();
-    }
-};
-#endif // #if BT_USE_OPENMP
-
-
-#if BT_USE_TBB
-///
-/// btTaskSchedulerTBB -- task scheduler implemented via Intel Threaded Building Blocks
-///
-class btTaskSchedulerTBB : public btITaskScheduler
-{
-    int m_numThreads;
-    tbb::task_scheduler_init* m_tbbSchedulerInit;
-
-public:
-    btTaskSchedulerTBB() : btITaskScheduler( "IntelTBB" )
-    {
-        m_numThreads = 0;
-        m_tbbSchedulerInit = NULL;
-    }
-    ~btTaskSchedulerTBB()
-    {
-        if ( m_tbbSchedulerInit )
-        {
-            delete m_tbbSchedulerInit;
-            m_tbbSchedulerInit = NULL;
-        }
-    }
-
-    virtual int getMaxNumThreads() const BT_OVERRIDE
-    {
-        return tbb::task_scheduler_init::default_num_threads();
-    }
-    virtual int getNumThreads() const BT_OVERRIDE
-    {
-        return m_numThreads;
-    }
-    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
-    {
-        m_numThreads = ( std::max )( 1, numThreads );
-        if ( m_tbbSchedulerInit )
-        {
-            delete m_tbbSchedulerInit;
-            m_tbbSchedulerInit = NULL;
-        }
-        m_tbbSchedulerInit = new tbb::task_scheduler_init( m_numThreads );
-    }
-    struct BodyAdapter
-    {
-        const btIParallelForBody* mBody;
-
-        void operator()( const tbb::blocked_range<int>& range ) const
-        {
-            BT_PROFILE( "TBB_job" );
-            mBody->forLoop( range.begin(), range.end() );
-        }
-    };
-    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
-    {
-        BT_PROFILE( "parallelFor_TBB" );
-        // TBB dispatch
-        BodyAdapter tbbBody;
-        tbbBody.mBody = &body;
-        btPushThreadsAreRunning();
-        tbb::parallel_for( tbb::blocked_range<int>( iBegin, iEnd, grainSize ),
-            tbbBody,
-            tbb::simple_partitioner()
-        );
-        btPopThreadsAreRunning();
-    }
-};
-#endif // #if BT_USE_TBB
-
-#if BT_USE_PPL
-///
-/// btTaskSchedulerPPL -- task scheduler implemented via Microsoft Parallel Patterns Lib
-///
-class btTaskSchedulerPPL : public btITaskScheduler
-{
-    int m_numThreads;
-public:
-    btTaskSchedulerPPL() : btITaskScheduler( "PPL" )
-    {
-        m_numThreads = 0;
-    }
-    virtual int getMaxNumThreads() const BT_OVERRIDE
-    {
-        return concurrency::GetProcessorCount();
-    }
-    virtual int getNumThreads() const BT_OVERRIDE
-    {
-        return m_numThreads;
-    }
-    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
-    {
-        m_numThreads = ( std::max )( 1, numThreads );
-        using namespace concurrency;
-        if ( CurrentScheduler::Id() != -1 )
-        {
-            CurrentScheduler::Detach();
-        }
-        SchedulerPolicy policy;
-        policy.SetConcurrencyLimits( m_numThreads, m_numThreads );
-        CurrentScheduler::Create( policy );
-    }
-    struct BodyAdapter
-    {
-        const btIParallelForBody* mBody;
-        int mGrainSize;
-        int mIndexEnd;
-
-        void operator()( int i ) const
-        {
-            BT_PROFILE( "PPL_job" );
-            mBody->forLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );
-        }
-    };
-    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
-    {
-        BT_PROFILE( "parallelFor_PPL" );
-        // PPL dispatch
-        BodyAdapter pplBody;
-        pplBody.mBody = &body;
-        pplBody.mGrainSize = grainSize;
-        pplBody.mIndexEnd = iEnd;
-        btPushThreadsAreRunning();
-        // note: MSVC 2010 doesn't support partitioner args, so avoid them
-        concurrency::parallel_for( iBegin,
-            iEnd,
-            grainSize,
-            pplBody
-        );
-        btPopThreadsAreRunning();
-    }
-};
-#endif // #if BT_USE_PPL
-
+#endif // #if BT_USE_TBB && BT_THREADSAFE


+#if BT_THREADSAFE
 //
 // Lightweight spin-mutex based on atomics
 // Using ordinary system-provided mutexes like Windows critical sections was noticeably slower
@@ -415,28 +200,107 @@ void btSpinMutex::unlock()

 #endif  //#else //#elif USE_MSVC_INTRINSICS

+#else //#if BT_THREADSAFE
+
+// These should not be called ever
+void btSpinMutex::lock()
+{
+    btAssert( !"unimplemented btSpinMutex::lock() called" );
+}
+
+void btSpinMutex::unlock()
+{
+    btAssert( !"unimplemented btSpinMutex::unlock() called" );
+}
+
+bool btSpinMutex::tryLock()
+{
+    btAssert( !"unimplemented btSpinMutex::tryLock() called" );
+    return true;
+}
+
+#define THREAD_LOCAL_STATIC static
+
+#endif // #else //#if BT_THREADSAFE
+

 struct ThreadsafeCounter
 {
    unsigned int mCounter;
    btSpinMutex mMutex;

-    ThreadsafeCounter() {mCounter=0;}
+    ThreadsafeCounter()
+    {
+        mCounter = 0;
+        --mCounter; // first count should come back 0
+    }

    unsigned int getNext()
    {
        // no need to optimize this with atomics, it is only called ONCE per thread!
        mMutex.lock();
-        unsigned int val = mCounter++;
+        mCounter++;
+        if ( mCounter >= BT_MAX_THREAD_COUNT )
+        {
+            btAssert( !"thread counter exceeded" );
+            // wrap back to the first worker index
+            mCounter = 1;
+        }
+        unsigned int val = mCounter;
        mMutex.unlock();
        return val;
    }
 };

+
+static btITaskScheduler* gBtTaskScheduler;
+static int gThreadsRunningCounter = 0;  // useful for detecting if we are trying to do nested parallel-for calls
+static btSpinMutex gThreadsRunningCounterMutex;
 static ThreadsafeCounter gThreadCounter;


-// return a unique index per thread, starting with 0 and counting up
+//
+// BT_DETECT_BAD_THREAD_INDEX tries to detect when there are multiple threads assigned the same thread index.
+//
+// BT_DETECT_BAD_THREAD_INDEX is a developer option to test if
+// certain assumptions about how the task scheduler manages its threads
+// holds true.
+// The main assumption is:
+//   - when the threadpool is resized, the task scheduler either
+//      1. destroys all worker threads and creates all new ones in the correct number, OR
+//      2. never destroys a worker thread
+//
+// We make that assumption because we can't easily enumerate the worker threads of a task scheduler
+// to assign nice sequential thread-indexes. We also do not get notified if a worker thread is destroyed,
+// so we can't tell when a thread-index is no longer being used.
+// We allocate thread-indexes as needed with a sequential global thread counter.
+//
+// Our simple thread-counting scheme falls apart if the task scheduler destroys some threads but
+// continues to re-use other threads and the application repeatedly resizes the thread pool of the 
+// task scheduler.
+// In order to prevent the thread-counter from exceeding the global max (BT_MAX_THREAD_COUNT), we
+// wrap the thread counter back to 1. This should only happen if the worker threads have all been
+// destroyed and re-created.
+//
+// BT_DETECT_BAD_THREAD_INDEX only works for Win32 right now,
+// but could be adapted to work with pthreads
+#define BT_DETECT_BAD_THREAD_INDEX 0
+
+#if BT_DETECT_BAD_THREAD_INDEX
+
+typedef DWORD ThreadId_t;
+const static ThreadId_t kInvalidThreadId = 0;
+ThreadId_t gDebugThreadIds[ BT_MAX_THREAD_COUNT ];
+
+static ThreadId_t getDebugThreadId()
+{
+    return GetCurrentThreadId();
+}
+
+#endif // #if BT_DETECT_BAD_THREAD_INDEX
+
+
+// return a unique index per thread, main thread is 0, worker threads are in [1, BT_MAX_THREAD_COUNT)
 unsigned int btGetCurrentThreadIndex()
 {
    const unsigned int kNullIndex = ~0U;
@@ -444,7 +308,30 @@ unsigned int btGetCurrentThreadIndex()
    if ( sThreadIndex == kNullIndex )
    {
        sThreadIndex = gThreadCounter.getNext();
+        btAssert( sThreadIndex < BT_MAX_THREAD_COUNT );
    }
+#if BT_DETECT_BAD_THREAD_INDEX
+    if ( gBtTaskScheduler && sThreadIndex > 0 )
+    {
+        ThreadId_t tid = getDebugThreadId();
+        // if not set
+        if ( gDebugThreadIds[ sThreadIndex ] == kInvalidThreadId )
+        {
+            // set it
+            gDebugThreadIds[ sThreadIndex ] = tid;
+        }
+        else
+        {
+            if ( gDebugThreadIds[ sThreadIndex ] != tid )
+            {
+                // this could indicate the task scheduler is breaking our assumptions about
+                // how threads are managed when threadpool is resized
+                btAssert( !"there are 2 or more threads with the same thread-index!" );
+                __debugbreak();
+            }
+        }
+    }
+#endif // #if BT_DETECT_BAD_THREAD_INDEX
    return sThreadIndex;
 }

@@ -453,38 +340,123 @@ bool btIsMainThread()
    return btGetCurrentThreadIndex() == 0;
 }

-#else // #if BT_THREADSAFE
-
-// These should not be called ever
-void btSpinMutex::lock()
+void btResetThreadIndexCounter()
 {
-    btAssert(!"unimplemented btSpinMutex::lock() called");
+    // for when all current worker threads are destroyed
+    btAssert( btIsMainThread() );
+    gThreadCounter.mCounter = 0;
 }

-void btSpinMutex::unlock()
+btITaskScheduler::btITaskScheduler( const char* name )
 {
-    btAssert(!"unimplemented btSpinMutex::unlock() called");
+    m_name = name;
+    m_savedThreadCounter = 0;
+    m_isActive = false;
 }

-bool btSpinMutex::tryLock()
+void btITaskScheduler::activate()
 {
-    btAssert(!"unimplemented btSpinMutex::tryLock() called");
-    return true;
+    // gThreadCounter is used to assign a thread-index to each worker thread in a task scheduler.
+    // The main thread is always thread-index 0, and worker threads are numbered from 1 to 63 (BT_MAX_THREAD_COUNT-1)
+    // The thread-indexes need to be unique amongst the threads that can be running simultaneously.
+    // Since only one task scheduler can be used at a time, it is OK for a pair of threads that belong to different
+    // task schedulers to share the same thread index because they can't be running at the same time.
+    // So each task scheduler needs to keep its own thread counter value
+    if ( !m_isActive )
+    {
+        gThreadCounter.mCounter = m_savedThreadCounter;  // restore saved thread counter
+        m_isActive = true;
+    }
 }

-// non-parallel version of btParallelFor
+void btITaskScheduler::deactivate()
+{
+    if ( m_isActive )
+    {
+        m_savedThreadCounter = gThreadCounter.mCounter;  // save thread counter
+        m_isActive = false;
+    }
+}
+
+void btPushThreadsAreRunning()
+{
+    gThreadsRunningCounterMutex.lock();
+    gThreadsRunningCounter++;
+    gThreadsRunningCounterMutex.unlock();
+}
+
+void btPopThreadsAreRunning()
+{
+    gThreadsRunningCounterMutex.lock();
+    gThreadsRunningCounter--;
+    gThreadsRunningCounterMutex.unlock();
+}
+
+bool btThreadsAreRunning()
+{
+    return gThreadsRunningCounter != 0;
+}
+
+
+void btSetTaskScheduler( btITaskScheduler* ts )
+{
+    int threadId = btGetCurrentThreadIndex();  // make sure we call this on main thread at least once before any workers run
+    if ( threadId != 0 )
+    {
+        btAssert( !"btSetTaskScheduler must be called from the main thread!" );
+        return;
+    }
+    if ( gBtTaskScheduler )
+    {
+        // deactivate old task scheduler
+        gBtTaskScheduler->deactivate();
+    }
+    gBtTaskScheduler = ts;
+    if ( ts )
+    {
+        // activate new task scheduler
+        ts->activate();
+    }
+}
+
+
+btITaskScheduler* btGetTaskScheduler()
+{
+    return gBtTaskScheduler;
+}
+
+
 void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body )
 {
-    btAssert(!"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE");
-    body.forLoop( iBegin, iEnd );
-}
+#if BT_THREADSAFE

-#endif // #if BT_THREADSAFE
+#if BT_DETECT_BAD_THREAD_INDEX
+    if ( !btThreadsAreRunning() )
+    {
+        // clear out thread ids
+        for ( int i = 0; i < BT_MAX_THREAD_COUNT; ++i )
+        {
+            gDebugThreadIds[ i ] = kInvalidThreadId;
+        }
+    }
+#endif // #if BT_DETECT_BAD_THREAD_INDEX
+
+    btAssert( gBtTaskScheduler != NULL );  // call btSetTaskScheduler() with a valid task scheduler first!
+    gBtTaskScheduler->parallelFor( iBegin, iEnd, grainSize, body );
+
+#else // #if BT_THREADSAFE
+
+    // non-parallel version of btParallelFor
+    btAssert( !"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE" );
+    body.forLoop( iBegin, iEnd );
+
+#endif #else // #if BT_THREADSAFE
+}


 ///
 /// btTaskSchedulerSequential -- non-threaded implementation of task scheduler
-///                              (fallback in case no multi-threaded schedulers are available)
+///                              (really just useful for testing performance of single threaded vs multi)
 ///
 class btTaskSchedulerSequential : public btITaskScheduler
 {
@@ -500,6 +472,211 @@ public:
    }
 };

+
+#if BT_USE_OPENMP && BT_THREADSAFE
+///
+/// btTaskSchedulerOpenMP -- wrapper around OpenMP task scheduler
+///
+class btTaskSchedulerOpenMP : public btITaskScheduler
+{
+    int m_numThreads;
+public:
+    btTaskSchedulerOpenMP() : btITaskScheduler( "OpenMP" )
+    {
+        m_numThreads = 0;
+    }
+    virtual int getMaxNumThreads() const BT_OVERRIDE
+    {
+        return omp_get_max_threads();
+    }
+    virtual int getNumThreads() const BT_OVERRIDE
+    {
+        return m_numThreads;
+    }
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
+    {
+        // With OpenMP, because it is a standard with various implementations, we can't
+        // know for sure if every implementation has the same behavior of destroying all
+        // previous threads when resizing the threadpool
+        m_numThreads = ( std::max )( 1, ( std::min )( int( BT_MAX_THREAD_COUNT ), numThreads ) );
+        omp_set_num_threads( 1 );  // hopefully, all previous threads get destroyed here
+        omp_set_num_threads( m_numThreads );
+        m_savedThreadCounter = 0;
+        if ( m_isActive )
+        {
+            btResetThreadIndexCounter();
+        }
+    }
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelFor_OpenMP" );
+        btPushThreadsAreRunning();
+#pragma omp parallel for schedule( static, 1 )
+        for ( int i = iBegin; i < iEnd; i += grainSize )
+        {
+            BT_PROFILE( "OpenMP_job" );
+            body.forLoop( i, ( std::min )( i + grainSize, iEnd ) );
+        }
+        btPopThreadsAreRunning();
+    }
+};
+#endif // #if BT_USE_OPENMP && BT_THREADSAFE
+
+
+#if BT_USE_TBB && BT_THREADSAFE
+///
+/// btTaskSchedulerTBB -- wrapper around Intel Threaded Building Blocks task scheduler
+///
+class btTaskSchedulerTBB : public btITaskScheduler
+{
+    int m_numThreads;
+    tbb::task_scheduler_init* m_tbbSchedulerInit;
+
+public:
+    btTaskSchedulerTBB() : btITaskScheduler( "IntelTBB" )
+    {
+        m_numThreads = 0;
+        m_tbbSchedulerInit = NULL;
+    }
+    ~btTaskSchedulerTBB()
+    {
+        if ( m_tbbSchedulerInit )
+        {
+            delete m_tbbSchedulerInit;
+            m_tbbSchedulerInit = NULL;
+        }
+    }
+
+    virtual int getMaxNumThreads() const BT_OVERRIDE
+    {
+        return tbb::task_scheduler_init::default_num_threads();
+    }
+    virtual int getNumThreads() const BT_OVERRIDE
+    {
+        return m_numThreads;
+    }
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
+    {
+        m_numThreads = ( std::max )( 1, ( std::min )( int(BT_MAX_THREAD_COUNT), numThreads ) );
+        if ( m_tbbSchedulerInit )
+        {
+            // destroys all previous threads
+            delete m_tbbSchedulerInit;
+            m_tbbSchedulerInit = NULL;
+        }
+        m_tbbSchedulerInit = new tbb::task_scheduler_init( m_numThreads );
+        m_savedThreadCounter = 0;
+        if ( m_isActive )
+        {
+            btResetThreadIndexCounter();
+        }
+    }
+    struct BodyAdapter
+    {
+        const btIParallelForBody* mBody;
+
+        void operator()( const tbb::blocked_range<int>& range ) const
+        {
+            BT_PROFILE( "TBB_job" );
+            mBody->forLoop( range.begin(), range.end() );
+        }
+    };
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelFor_TBB" );
+        // TBB dispatch
+        BodyAdapter tbbBody;
+        tbbBody.mBody = &body;
+        btPushThreadsAreRunning();
+        tbb::parallel_for( tbb::blocked_range<int>( iBegin, iEnd, grainSize ),
+            tbbBody,
+            tbb::simple_partitioner()
+        );
+        btPopThreadsAreRunning();
+    }
+};
+#endif // #if BT_USE_TBB && BT_THREADSAFE
+
+
+#if BT_USE_PPL && BT_THREADSAFE
+///
+/// btTaskSchedulerPPL -- wrapper around Microsoft Parallel Patterns Lib task scheduler
+///
+class btTaskSchedulerPPL : public btITaskScheduler
+{
+    int m_numThreads;
+public:
+    btTaskSchedulerPPL() : btITaskScheduler( "PPL" )
+    {
+        m_numThreads = 0;
+    }
+    virtual int getMaxNumThreads() const BT_OVERRIDE
+    {
+        return concurrency::GetProcessorCount();
+    }
+    virtual int getNumThreads() const BT_OVERRIDE
+    {
+        return m_numThreads;
+    }
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
+    {
+        // capping the thread count for PPL due to a thread-index issue
+        const int maxThreadCount = (std::min)(int(BT_MAX_THREAD_COUNT), 31);
+        m_numThreads = ( std::max )( 1, ( std::min )( maxThreadCount, numThreads ) );
+        using namespace concurrency;
+        if ( CurrentScheduler::Id() != -1 )
+        {
+            CurrentScheduler::Detach();
+        }
+        SchedulerPolicy policy;
+        {
+            // PPL seems to destroy threads when threadpool is shrunk, but keeps reusing old threads
+            // force it to destroy old threads
+            policy.SetConcurrencyLimits( 1, 1 );
+            CurrentScheduler::Create( policy );
+            CurrentScheduler::Detach();
+        }
+        policy.SetConcurrencyLimits( m_numThreads, m_numThreads );
+        CurrentScheduler::Create( policy );
+        m_savedThreadCounter = 0;
+        if ( m_isActive )
+        {
+            btResetThreadIndexCounter();
+        }
+    }
+    struct BodyAdapter
+    {
+        const btIParallelForBody* mBody;
+        int mGrainSize;
+        int mIndexEnd;
+
+        void operator()( int i ) const
+        {
+            BT_PROFILE( "PPL_job" );
+            mBody->forLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) );
+        }
+    };
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelFor_PPL" );
+        // PPL dispatch
+        BodyAdapter pplBody;
+        pplBody.mBody = &body;
+        pplBody.mGrainSize = grainSize;
+        pplBody.mIndexEnd = iEnd;
+        btPushThreadsAreRunning();
+        // note: MSVC 2010 doesn't support partitioner args, so avoid them
+        concurrency::parallel_for( iBegin,
+            iEnd,
+            grainSize,
+            pplBody
+        );
+        btPopThreadsAreRunning();
+    }
+};
+#endif // #if BT_USE_PPL && BT_THREADSAFE
+
+
 // create a non-threaded task scheduler (always available)
 btITaskScheduler* btGetSequentialTaskScheduler()
 {
--- a/src/LinearMath/btThreads.h
+++ b/src/LinearMath/btThreads.h
@@ -28,6 +28,14 @@ subject to the following restrictions:
 #define BT_OVERRIDE
 #endif

+const unsigned int BT_MAX_THREAD_COUNT = 64;  // only if BT_THREADSAFE is 1
+
+// for internal use only
+bool btIsMainThread();
+bool btThreadsAreRunning();
+unsigned int btGetCurrentThreadIndex();
+void btResetThreadIndexCounter(); // notify that all worker threads have been destroyed
+
 ///
 /// btSpinMutex -- lightweight spin-mutex implemented with atomic ops, never puts
 ///               a thread to sleep because it is designed to be used with a task scheduler
@@ -48,39 +56,41 @@ public:
    bool tryLock();
 };

-#if BT_THREADSAFE

-// for internal Bullet use only
+//
+// NOTE: btMutex* is for internal Bullet use only
+//
+// If BT_THREADSAFE is undefined or 0, should optimize away to nothing.
+// This is good because for the single-threaded build of Bullet, any calls
+// to these functions will be optimized out.
+//
+// However, for users of the multi-threaded build of Bullet this is kind
+// of bad because if you call any of these functions from external code
+// (where BT_THREADSAFE is undefined) you will get unexpected race conditions.
+//
 SIMD_FORCE_INLINE void btMutexLock( btSpinMutex* mutex )
 {
+#if BT_THREADSAFE
    mutex->lock();
+#endif // #if BT_THREADSAFE
 }

 SIMD_FORCE_INLINE void btMutexUnlock( btSpinMutex* mutex )
 {
+#if BT_THREADSAFE
    mutex->unlock();
+#endif // #if BT_THREADSAFE
 }

 SIMD_FORCE_INLINE bool btMutexTryLock( btSpinMutex* mutex )
 {
+#if BT_THREADSAFE
    return mutex->tryLock();
+#else
+    return true;
+#endif // #if BT_THREADSAFE
 }

-// for internal use only
-bool btIsMainThread();
-bool btThreadsAreRunning();
-unsigned int btGetCurrentThreadIndex();
-const unsigned int BT_MAX_THREAD_COUNT = 64;
-
-#else
-
-// for internal Bullet use only
-// if BT_THREADSAFE is undefined or 0, should optimize away to nothing
-SIMD_FORCE_INLINE void btMutexLock( btSpinMutex* ) {}
-SIMD_FORCE_INLINE void btMutexUnlock( btSpinMutex* ) {}
-SIMD_FORCE_INLINE bool btMutexTryLock( btSpinMutex* ) {return true;}
-SIMD_FORCE_INLINE bool btThreadsAreRunning() { return false;}
-#endif

 //
 // btIParallelForBody -- subclass this to express work that can be done in parallel
@@ -97,16 +107,24 @@ public:
 //
 class btITaskScheduler
 {
-    const char* m_name;
 public:
-    btITaskScheduler( const char* name ) : m_name( name ) {}
+    btITaskScheduler( const char* name );
+    virtual ~btITaskScheduler() {}
    const char* getName() const { return m_name; }

-    virtual ~btITaskScheduler() {}
    virtual int getMaxNumThreads() const = 0;
    virtual int getNumThreads() const = 0;
    virtual void setNumThreads( int numThreads ) = 0;
    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) = 0;
+
+    // internal use only
+    virtual void activate();
+    virtual void deactivate();
+
+protected:
+    const char* m_name;
+    unsigned int m_savedThreadCounter;
+    bool m_isActive;
 };

 // set the task scheduler to use for all calls to btParallelFor()