From b8720f2161ad5a27693341b9ef47ec2380a4a972 Mon Sep 17 00:00:00 2001 From: Lunkhound Date: Sun, 4 Jun 2017 17:57:25 -0700 Subject: [PATCH 1/8] parallel solver: various changes - threading: adding btSequentialImpulseConstraintSolverMt - task scheduler: added parallelSum so that parallel solver can compute residuals - CommonRigidBodyMTBase: add slider for solver least squares residual and allow multithreading without needing OpenMP, TBB, or PPL - taskScheduler: don't wait for workers to sleep/signal at the end of each parallel block - parallel solver: convertContacts split into an allocContactConstraints and setupContactConstraints stage, the latter of which is done in parallel - parallel solver: rolling friction is now interleaved along with normal friction - parallel solver: batchified split impulse solving + some cleanup - parallel solver: sorting batches from largest to smallest - parallel solver: added parallel batch creation - parallel solver: added warmstartingWriteBackContacts func + other cleanup - task scheduler: truncate low bits to preserve determinism with parallelSum - parallel solver: reducing dynamic mem allocs and trying to parallelize more of the batch setup - parallel solver: parallelize updating constraint batch ids for merging - parallel solver: adding debug visualization - task scheduler: make TBB task scheduler parallelSum deterministic - parallel solver: split batch gen code into separate file; allow selection of batch gen method - task scheduler: add sleepWorkerThreadsHint() at end of simulation - parallel solver: added grain size per phase - task Scheduler: fix for strange threading issue; also no need for main thread to wait for workers to sleep - base constraint solver: break out joint setup into separate function for profiling/overriding - parallel solver: allow different batching method for contacts vs joints - base constraint solver: add convertJoint and convertBodies to make it possible to parallelize joint and body conversion - 
parallel solver: convert joints and bodies in parallel now - parallel solver: speed up batch creation with run-length encoding - parallel solver: batch gen: run-length expansion in parallel; collect constraint info in parallel - parallel solver: adding spatial grid batching method - parallel solver: enhancements to spatial grid batching - sequential solver: moving code for writing back into functions that derived classes can call - parallel solver: do write back of bodies and joints in parallel - parallel solver: removed all batching methods except for spatial grid (others were ineffective) - parallel solver: added 2D or 3D grid batching options; and a bit of cleanup - move btDefaultTaskScheduler into LinearMath project --- CMakeLists.txt | 13 +- examples/ExampleBrowser/CMakeLists.txt | 1 - .../CommonRigidBodyMTBase.cpp | 235 ++- .../MultiThreadedDemo/CommonRigidBodyMTBase.h | 2 + .../MultiThreadedDemo/MultiThreadedDemo.cpp | 31 +- examples/MultiThreading/btTaskScheduler.cpp | 448 ----- examples/MultiThreading/btTaskScheduler.h | 26 - src/BulletDynamics/CMakeLists.txt | 3 + .../ConstraintSolver/btBatchedConstraints.cpp | 1129 ++++++++++++ .../ConstraintSolver/btBatchedConstraints.h | 66 + .../btSequentialImpulseConstraintSolver.cpp | 530 +++--- .../btSequentialImpulseConstraintSolver.h | 8 +- .../btSequentialImpulseConstraintSolverMt.cpp | 1611 +++++++++++++++++ .../btSequentialImpulseConstraintSolverMt.h | 154 ++ .../Dynamics/btDiscreteDynamicsWorldMt.cpp | 11 + .../Dynamics/btDiscreteDynamicsWorldMt.h | 2 + .../Dynamics/btSimulationIslandManagerMt.cpp | 43 +- .../Dynamics/btSimulationIslandManagerMt.h | 2 + src/LinearMath/CMakeLists.txt | 4 + .../TaskScheduler/btTaskScheduler.cpp | 619 +++++++ .../TaskScheduler/btThreadSupportInterface.h | 75 + .../TaskScheduler/btThreadSupportPosix.cpp | 369 ++++ .../TaskScheduler/btThreadSupportWin32.cpp | 480 +++++ src/LinearMath/btThreads.cpp | 121 +- src/LinearMath/btThreads.h | 20 + 25 files changed, 5236 insertions(+), 767 
deletions(-) delete mode 100644 examples/MultiThreading/btTaskScheduler.cpp delete mode 100644 examples/MultiThreading/btTaskScheduler.h create mode 100644 src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp create mode 100644 src/BulletDynamics/ConstraintSolver/btBatchedConstraints.h create mode 100644 src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp create mode 100644 src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h create mode 100644 src/LinearMath/TaskScheduler/btTaskScheduler.cpp create mode 100644 src/LinearMath/TaskScheduler/btThreadSupportInterface.h create mode 100644 src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp create mode 100644 src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index c14c02640..2a951d25a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,14 +28,14 @@ OPTION(USE_GRAPHICAL_BENCHMARK "Use Graphical Benchmark" ON) OPTION(BUILD_SHARED_LIBS "Use shared libraries" OFF) OPTION(USE_SOFT_BODY_MULTI_BODY_DYNAMICS_WORLD "Use btSoftMultiBodyDynamicsWorld" ON) -OPTION(BULLET2_USE_THREAD_LOCKS "Build Bullet 2 libraries with mutex locking around certain operations (required for multi-threading)" OFF) -IF (BULLET2_USE_THREAD_LOCKS) +OPTION(BULLET2_MULTITHREADING "Build Bullet 2 libraries with mutex locking around certain operations (required for multi-threading)" OFF) +IF (BULLET2_MULTITHREADING) OPTION(BULLET2_USE_OPEN_MP_MULTITHREADING "Build Bullet 2 with support for multi-threading with OpenMP (requires a compiler with OpenMP support)" OFF) OPTION(BULLET2_USE_TBB_MULTITHREADING "Build Bullet 2 with support for multi-threading with Intel Threading Building Blocks (requires the TBB library to be already installed)" OFF) IF (MSVC) OPTION(BULLET2_USE_PPL_MULTITHREADING "Build Bullet 2 with support for multi-threading with Microsoft Parallel Patterns Library (requires MSVC compiler)" OFF) ENDIF (MSVC) -ENDIF 
(BULLET2_USE_THREAD_LOCKS) +ENDIF (BULLET2_MULTITHREADING) IF(NOT WIN32) @@ -225,12 +225,15 @@ IF(USE_GRAPHICAL_BENCHMARK) ADD_DEFINITIONS( -DUSE_GRAPHICAL_BENCHMARK) ENDIF (USE_GRAPHICAL_BENCHMARK) -IF(BULLET2_USE_THREAD_LOCKS) +IF(BULLET2_MULTITHREADING) ADD_DEFINITIONS( -DBT_THREADSAFE=1 ) IF (NOT MSVC) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") ENDIF (NOT MSVC) -ENDIF (BULLET2_USE_THREAD_LOCKS) + IF (NOT WIN32) + LINK_LIBRARIES( pthread ) + ENDIF (NOT WIN32) +ENDIF (BULLET2_MULTITHREADING) IF (BULLET2_USE_OPEN_MP_MULTITHREADING) ADD_DEFINITIONS("-DBT_USE_OPENMP=1") diff --git a/examples/ExampleBrowser/CMakeLists.txt b/examples/ExampleBrowser/CMakeLists.txt index cd50c7cf2..6bccf0d5c 100644 --- a/examples/ExampleBrowser/CMakeLists.txt +++ b/examples/ExampleBrowser/CMakeLists.txt @@ -226,7 +226,6 @@ SET(BulletExampleBrowser_SRCS ../MultiThreading/b3PosixThreadSupport.cpp ../MultiThreading/b3Win32ThreadSupport.cpp ../MultiThreading/b3ThreadSupportInterface.cpp - ../MultiThreading/btTaskScheduler.cpp ../RenderingExamples/TinyRendererSetup.cpp ../RenderingExamples/TimeSeriesCanvas.cpp ../RenderingExamples/TimeSeriesCanvas.h diff --git a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp index 1cf21db1e..b11cd7691 100644 --- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp +++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp @@ -29,17 +29,17 @@ class btCollisionShape; #include "BulletCollision/CollisionDispatch/btCollisionDispatcherMt.h" #include "BulletDynamics/Dynamics/btSimulationIslandManagerMt.h" // for setSplitIslands() #include "BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h" +#include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h" #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h" #include "BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.h" #include "BulletDynamics/MLCPSolvers/btMLCPSolver.h" #include 
"BulletDynamics/MLCPSolvers/btSolveProjectedGaussSeidel.h" #include "BulletDynamics/MLCPSolvers/btDantzigSolver.h" #include "BulletDynamics/MLCPSolvers/btLemkeSolver.h" -#include "../MultiThreading/btTaskScheduler.h" static int gNumIslands = 0; - +bool gAllowNestedParallelForLoops = false; class Profiler { @@ -52,6 +52,10 @@ public: kRecordPredictUnconstrainedMotion, kRecordCreatePredictiveContacts, kRecordIntegrateTransforms, + kRecordSolverTotal, + kRecordSolverSetup, + kRecordSolverIterations, + kRecordSolverFinish, kRecordCount }; @@ -139,6 +143,41 @@ static void profileEndCallback( btDynamicsWorld *world, btScalar timeStep ) } +class MySequentialImpulseConstraintSolverMt : public btSequentialImpulseConstraintSolverMt +{ + typedef btSequentialImpulseConstraintSolverMt ParentClass; +public: + BT_DECLARE_ALIGNED_ALLOCATOR(); + + MySequentialImpulseConstraintSolverMt() {} + + // for profiling + virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE + { + ProfileHelper prof(Profiler::kRecordSolverSetup); + btScalar ret = ParentClass::solveGroupCacheFriendlySetup(bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer ); + return ret; + } + virtual btScalar solveGroupCacheFriendlyIterations( btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer ) BT_OVERRIDE + { + ProfileHelper prof(Profiler::kRecordSolverIterations); + btScalar ret = ParentClass::solveGroupCacheFriendlyIterations(bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer ); + return ret; + } + virtual btScalar 
solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal) BT_OVERRIDE + { + ProfileHelper prof(Profiler::kRecordSolverFinish); + btScalar ret = ParentClass::solveGroupCacheFriendlyFinish(bodies, numBodies, infoGlobal); + return ret; + } + virtual btScalar solveGroup(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifold, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btDispatcher* dispatcher) BT_OVERRIDE + { + ProfileHelper prof(Profiler::kRecordSolverTotal); + btScalar ret = ParentClass::solveGroup(bodies, numBodies, manifold, numManifolds, constraints, numConstraints, info, debugDrawer, dispatcher); + return ret; + } +}; + /// /// MyCollisionDispatcher -- subclassed for profiling purposes /// @@ -218,6 +257,8 @@ btConstraintSolver* createSolverByType( SolverType t ) { case SOLVER_TYPE_SEQUENTIAL_IMPULSE: return new btSequentialImpulseConstraintSolver(); + case SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT: + return new MySequentialImpulseConstraintSolverMt(); case SOLVER_TYPE_NNCG: return new btNNCGConstraintSolver(); case SOLVER_TYPE_MLCP_PGS: @@ -253,7 +294,7 @@ public: { addTaskScheduler( btGetSequentialTaskScheduler() ); #if BT_THREADSAFE - if ( btITaskScheduler* ts = createDefaultTaskScheduler() ) + if ( btITaskScheduler* ts = btCreateDefaultTaskScheduler() ) { m_allocatedTaskSchedulers.push_back( ts ); addTaskScheduler( ts ); @@ -310,7 +351,7 @@ static bool gDisplayProfileInfo = true; static bool gMultithreadedWorld = false; static bool gDisplayProfileInfo = false; #endif -static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE; +static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT; static int gSolverMode = SOLVER_SIMD | SOLVER_USE_WARMSTARTING | // SOLVER_RANDMIZE_ORDER | @@ -318,9 +359,11 @@ static int gSolverMode = SOLVER_SIMD | // SOLVER_USE_2_FRICTION_DIRECTIONS | 0; static btScalar 
gSliderSolverIterations = 10.0f; // should be int - static btScalar gSliderNumThreads = 1.0f; // should be int - +static btScalar gSliderIslandBatchingThreshold = 0.0f; // should be int +static btScalar gSliderMinBatchSize = btScalar(btSequentialImpulseConstraintSolverMt::s_minBatchSize); // should be int +static btScalar gSliderMaxBatchSize = btScalar(btSequentialImpulseConstraintSolverMt::s_maxBatchSize); // should be int +static btScalar gSliderLeastSquaresResidualThreshold = 0.0f; //////////////////////////////////// CommonRigidBodyMTBase::CommonRigidBodyMTBase( struct GUIHelperInterface* helper ) @@ -419,6 +462,23 @@ void setTaskSchedulerComboBoxCallback(int combobox, const char* item, void* user } +void setBatchingMethodComboBoxCallback(int combobox, const char* item, void* userPointer) +{ +#if BT_THREADSAFE + const char** items = static_cast<const char**>( userPointer ); + for ( int i = 0; i < btBatchedConstraints::BATCHING_METHOD_COUNT; ++i ) + { + if ( strcmp( item, items[ i ] ) == 0 ) + { + // change the contact batching method + btSequentialImpulseConstraintSolverMt::s_contactBatchingMethod = static_cast<btBatchedConstraints::BatchingMethod>( i ); + break; + } + } +#endif // #if BT_THREADSAFE +} + + static void setThreadCountCallback(float val, void* userPtr) { #if BT_THREADSAFE @@ -435,13 +495,43 @@ static void setSolverIterationCountCallback(float val, void* userPtr) } } +static void setLargeIslandManifoldCountCallback( float val, void* userPtr ) +{ + btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching = int( gSliderIslandBatchingThreshold ); +} + +static void setMinBatchSizeCallback( float val, void* userPtr ) +{ + gSliderMaxBatchSize = (std::max)(gSliderMinBatchSize, gSliderMaxBatchSize); + btSequentialImpulseConstraintSolverMt::s_minBatchSize = int(gSliderMinBatchSize); + btSequentialImpulseConstraintSolverMt::s_maxBatchSize = int(gSliderMaxBatchSize); +} + +static void setMaxBatchSizeCallback( float val, void* userPtr ) +{ + gSliderMinBatchSize = (std::min)(gSliderMinBatchSize, 
gSliderMaxBatchSize); + btSequentialImpulseConstraintSolverMt::s_minBatchSize = int(gSliderMinBatchSize); + btSequentialImpulseConstraintSolverMt::s_maxBatchSize = int(gSliderMaxBatchSize); +} + +static void setLeastSquaresResidualThresholdCallback( float val, void* userPtr ) +{ + if (btDiscreteDynamicsWorld* world = reinterpret_cast<btDiscreteDynamicsWorld*>(userPtr)) + { + world->getSolverInfo().m_leastSquaresResidualThreshold = gSliderLeastSquaresResidualThreshold; + } +} + void CommonRigidBodyMTBase::createEmptyDynamicsWorld() { gNumIslands = 0; m_solverType = gSolverType; -#if BT_THREADSAFE && (BT_USE_OPENMP || BT_USE_PPL || BT_USE_TBB) +#if BT_THREADSAFE btAssert( btGetTaskScheduler() != NULL ); - m_multithreadCapable = true; + if (NULL != btGetTaskScheduler() && gTaskSchedulerMgr.getNumTaskSchedulers() > 1) + { + m_multithreadCapable = true; + } #endif if ( gMultithreadedWorld ) { @@ -486,7 +576,12 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld() m_broadphase = new btDbvtBroadphase(); - m_solver = createSolverByType( m_solverType ); + SolverType solverType = m_solverType; + if ( solverType == SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT ) + { + solverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE; + } + m_solver = createSolverByType( solverType ); m_dynamicsWorld = new btDiscreteDynamicsWorld( m_dispatcher, m_broadphase, m_solver, m_collisionConfiguration ); } @@ -494,6 +589,7 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld() m_dynamicsWorld->setInternalTickCallback( profileEndCallback, NULL, false ); m_dynamicsWorld->setGravity( btVector3( 0, -10, 0 ) ); m_dynamicsWorld->getSolverInfo().m_solverMode = gSolverMode; + m_dynamicsWorld->getSolverInfo().m_numIterations = btMax(1, int(gSliderSolverIterations)); createDefaultParameters(); } @@ -504,16 +600,18 @@ void CommonRigidBodyMTBase::createDefaultParameters() { // create a button to toggle multithreaded world ButtonParams button( "Multithreaded world enable", 0, true ); - button.m_initialState = gMultithreadedWorld; - 
button.m_userPointer = &gMultithreadedWorld; + bool* ptr = &gMultithreadedWorld; + button.m_initialState = *ptr; + button.m_userPointer = ptr; button.m_callback = boolPtrButtonCallback; m_guiHelper->getParameterInterface()->registerButtonParameter( button ); } { // create a button to toggle profile printing ButtonParams button( "Display solver info", 0, true ); - button.m_initialState = gDisplayProfileInfo; - button.m_userPointer = &gDisplayProfileInfo; + bool* ptr = &gDisplayProfileInfo; + button.m_initialState = *ptr; + button.m_userPointer = ptr; button.m_callback = boolPtrButtonCallback; m_guiHelper->getParameterInterface()->registerButtonParameter( button ); } @@ -544,6 +642,16 @@ void CommonRigidBodyMTBase::createDefaultParameters() slider.m_clampToIntegers = true; m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider ); } + { + // a slider for the solver leastSquaresResidualThreshold (used to run fewer solver iterations when convergence is good) + SliderParams slider( "Solver residual thresh", &gSliderLeastSquaresResidualThreshold ); + slider.m_minVal = 0.0f; + slider.m_maxVal = 0.25f; + slider.m_callback = setLeastSquaresResidualThresholdCallback; + slider.m_userPointer = m_dynamicsWorld; + slider.m_clampToIntegers = false; + m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider ); + } { ButtonParams button( "Solver use SIMD", 0, true ); button.m_buttonId = SOLVER_SIMD; @@ -618,20 +726,86 @@ void CommonRigidBodyMTBase::createDefaultParameters() m_guiHelper->getParameterInterface()->registerComboBox( comboParams ); } { - // create a slider to set the number of threads to use - int numThreads = btGetTaskScheduler()->getNumThreads(); // if slider has not been set yet (by another demo), if ( gSliderNumThreads <= 1.0f ) { + // create a slider to set the number of threads to use + int numThreads = btGetTaskScheduler()->getNumThreads(); gSliderNumThreads = float( numThreads ); } + int maxNumThreads = 
btGetTaskScheduler()->getMaxNumThreads(); SliderParams slider("Thread count", &gSliderNumThreads); slider.m_minVal = 1.0f; - slider.m_maxVal = float( BT_MAX_THREAD_COUNT ); + slider.m_maxVal = float( maxNumThreads ); slider.m_callback = setThreadCountCallback; slider.m_clampToIntegers = true; m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider ); } + { + // a slider for the number of manifolds an island needs to be too large for parallel dispatch + if (gSliderIslandBatchingThreshold < 1.0) + { + gSliderIslandBatchingThreshold = float(btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching); + } + SliderParams slider( "IslandBatchThresh", &gSliderIslandBatchingThreshold ); + slider.m_minVal = 1.0f; + slider.m_maxVal = 2000.0f; + slider.m_callback = setLargeIslandManifoldCountCallback; + slider.m_userPointer = NULL; + slider.m_clampToIntegers = true; + m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider ); + } + { + // create a combo box for selecting the batching method + static const char* sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_COUNT ]; + { + sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_2D ] = "Batching: 2D Grid"; + sBatchingMethodComboBoxItems[ btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_3D ] = "Batching: 3D Grid"; + }; + ComboBoxParams comboParams; + comboParams.m_userPointer = sBatchingMethodComboBoxItems; + comboParams.m_numItems = btBatchedConstraints::BATCHING_METHOD_COUNT; + comboParams.m_startItem = static_cast<int>(btSequentialImpulseConstraintSolverMt::s_contactBatchingMethod); + comboParams.m_items = sBatchingMethodComboBoxItems; + comboParams.m_callback = setBatchingMethodComboBoxCallback; + m_guiHelper->getParameterInterface()->registerComboBox( comboParams ); + } + { + // a slider for the sequentialImpulseConstraintSolverMt min batch size (when batching) + SliderParams slider( "Min batch size", &gSliderMinBatchSize ); + 
slider.m_minVal = 1.0f; + slider.m_maxVal = 1000.0f; + slider.m_callback = setMinBatchSizeCallback; + slider.m_userPointer = NULL; + slider.m_clampToIntegers = true; + m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider ); + } + { + // a slider for the sequentialImpulseConstraintSolverMt max batch size (when batching) + SliderParams slider( "Max batch size", &gSliderMaxBatchSize ); + slider.m_minVal = 1.0f; + slider.m_maxVal = 1000.0f; + slider.m_callback = setMaxBatchSizeCallback; + slider.m_userPointer = NULL; + slider.m_clampToIntegers = true; + m_guiHelper->getParameterInterface()->registerSliderFloatParameter( slider ); + } + { + // create a button to toggle debug drawing of batching visualization + ButtonParams button( "Visualize batching", 0, true ); + bool* ptr = &btBatchedConstraints::s_debugDrawBatches; + button.m_initialState = *ptr; + button.m_userPointer = ptr; + button.m_callback = boolPtrButtonCallback; + m_guiHelper->getParameterInterface()->registerButtonParameter( button ); + } + { + ButtonParams button( "Allow Nested ParallelFor", 0, true ); + button.m_initialState = btSequentialImpulseConstraintSolverMt::s_allowNestedParallelForLoops; + button.m_userPointer = &btSequentialImpulseConstraintSolverMt::s_allowNestedParallelForLoops; + button.m_callback = boolPtrButtonCallback; + m_guiHelper->getParameterInterface()->registerButtonParameter( button ); + } #endif // #if BT_THREADSAFE } } @@ -643,6 +817,7 @@ void CommonRigidBodyMTBase::drawScreenText() int xCoord = 400; int yCoord = 30; int yStep = 30; + int indent = 30; if (m_solverType != gSolverType) { sprintf( msg, "restart example to change solver type" ); @@ -721,6 +896,34 @@ void CommonRigidBodyMTBase::drawScreenText() m_guiHelper->getAppInterface()->drawText( msg, xCoord, yCoord, 0.4f ); yCoord += yStep; + sprintf( msg, + "SolverTotal %5.3f ms", + gProfiler.getAverageTime( Profiler::kRecordSolverTotal )*0.001f + ); + m_guiHelper->getAppInterface()->drawText( msg, xCoord, 
yCoord, 0.4f ); + yCoord += yStep; + + sprintf( msg, + "SolverSetup %5.3f ms", + gProfiler.getAverageTime( Profiler::kRecordSolverSetup )*0.001f + ); + m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f ); + yCoord += yStep; + + sprintf( msg, + "SolverIterations %5.3f ms", + gProfiler.getAverageTime( Profiler::kRecordSolverIterations )*0.001f + ); + m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f ); + yCoord += yStep; + + sprintf( msg, + "SolverFinish %5.3f ms", + gProfiler.getAverageTime( Profiler::kRecordSolverFinish )*0.001f + ); + m_guiHelper->getAppInterface()->drawText( msg, xCoord + indent, yCoord, 0.4f ); + yCoord += yStep; + sprintf( msg, "PredictUnconstrainedMotion %5.3f ms", gProfiler.getAverageTime( Profiler::kRecordPredictUnconstrainedMotion )*0.001f diff --git a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h index 0695b88c0..c283a3f22 100644 --- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h +++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.h @@ -14,6 +14,7 @@ enum SolverType { SOLVER_TYPE_SEQUENTIAL_IMPULSE, + SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT, SOLVER_TYPE_NNCG, SOLVER_TYPE_MLCP_PGS, SOLVER_TYPE_MLCP_DANTZIG, @@ -27,6 +28,7 @@ inline const char* getSolverTypeName( SolverType t ) switch (t) { case SOLVER_TYPE_SEQUENTIAL_IMPULSE: return "SequentialImpulse"; + case SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT: return "SequentialImpulseMt"; case SOLVER_TYPE_NNCG: return "NNCG"; case SOLVER_TYPE_MLCP_PGS: return "MLCP ProjectedGaussSeidel"; case SOLVER_TYPE_MLCP_DANTZIG: return "MLCP Dantzig"; diff --git a/examples/MultiThreadedDemo/MultiThreadedDemo.cpp b/examples/MultiThreadedDemo/MultiThreadedDemo.cpp index a04ab0d91..3cfcec807 100644 --- a/examples/MultiThreadedDemo/MultiThreadedDemo.cpp +++ b/examples/MultiThreadedDemo/MultiThreadedDemo.cpp @@ -25,10 +25,10 @@ subject to the following restrictions: -static btScalar gSliderStackRows = 8.0f; 
-static btScalar gSliderStackColumns = 6.0f; -static btScalar gSliderStackHeight = 10.0f; -static btScalar gSliderStackWidth = 1.0f; +static btScalar gSliderStackRows = 1.0f; +static btScalar gSliderStackColumns = 1.0f; +static btScalar gSliderStackHeight = 15.0f; +static btScalar gSliderStackWidth = 8.0f; static btScalar gSliderGroundHorizontalAmplitude = 0.0f; static btScalar gSliderGroundVerticalAmplitude = 0.0f; static btScalar gSliderGroundTilt = 0.0f; @@ -75,6 +75,21 @@ public: btScalar tilt = gSliderGroundTilt * SIMD_2_PI / 360.0f; return btQuaternion( btVector3( 1.0f, 0.0f, 0.0f ), tilt ); } + struct TestSumBody : public btIParallelSumBody + { + virtual btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + btScalar sum = 0.0f; + for (int i = iBegin; i < iEnd; ++i) + { + if (i > 0) + { + sum += 1.0f / btScalar(i); + } + } + return sum; + } + }; virtual void stepSimulation( float deltaTime ) BT_OVERRIDE { if ( m_dynamicsWorld ) @@ -115,6 +130,14 @@ public: // always step by 1/60 for benchmarking m_dynamicsWorld->stepSimulation( 1.0f / 60.0f, 0 ); } +#if 0 + { + // test parallelSum + TestSumBody testSumBody; + float testSum = btParallelSum( 1, 10000000, 10000, testSumBody ); + printf( "sum = %f\n", testSum ); + } +#endif } virtual void initPhysics() BT_OVERRIDE; diff --git a/examples/MultiThreading/btTaskScheduler.cpp b/examples/MultiThreading/btTaskScheduler.cpp deleted file mode 100644 index e6862a197..000000000 --- a/examples/MultiThreading/btTaskScheduler.cpp +++ /dev/null @@ -1,448 +0,0 @@ - -#include "LinearMath/btTransform.h" -#include "../Utils/b3Clock.h" -#include "LinearMath/btAlignedObjectArray.h" -#include "LinearMath/btThreads.h" -#include "LinearMath/btQuickprof.h" -#include -#include - - -typedef void( *btThreadFunc )( void* userPtr, void* lsMemory ); -typedef void* ( *btThreadLocalStorageFunc )(); - -#if BT_THREADSAFE - -#if defined( _WIN32 ) - -#include "b3Win32ThreadSupport.h" - -b3ThreadSupportInterface* createThreadSupport( int 
numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName ) -{ - b3Win32ThreadSupport::Win32ThreadConstructionInfo constructionInfo( uniqueName, threadFunc, localStoreFunc, numThreads ); - //constructionInfo.m_priority = 0; // highest priority (the default) -- can cause erratic performance when numThreads > numCores - // we don't want worker threads to be higher priority than the main thread or the main thread could get - // totally shut out and unable to tell the workers to stop - constructionInfo.m_priority = -1; // normal priority - b3Win32ThreadSupport* threadSupport = new b3Win32ThreadSupport( constructionInfo ); - return threadSupport; -} - -#else // #if defined( _WIN32 ) - -#include "b3PosixThreadSupport.h" - -b3ThreadSupportInterface* createThreadSupport( int numThreads, btThreadFunc threadFunc, btThreadLocalStorageFunc localStoreFunc, const char* uniqueName) -{ - b3PosixThreadSupport::ThreadConstructionInfo constructionInfo( uniqueName, threadFunc, localStoreFunc, numThreads ); - b3ThreadSupportInterface* threadSupport = new b3PosixThreadSupport( constructionInfo ); - return threadSupport; -} - -#endif // #else // #if defined( _WIN32 ) - - -/// -/// getNumHardwareThreads() -/// -/// -/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine -/// -#if __cplusplus >= 201103L - -#include <thread> - -int getNumHardwareThreads() -{ - return std::thread::hardware_concurrency(); -} - -#elif defined( _WIN32 ) - -#define WIN32_LEAN_AND_MEAN - -#include <windows.h> - -int getNumHardwareThreads() -{ - // caps out at 32 - SYSTEM_INFO info; - GetSystemInfo( &info ); - return info.dwNumberOfProcessors; -} - -#else - -int getNumHardwareThreads() -{ - return 0; // don't know -} - -#endif - - -struct WorkerThreadStatus -{ - enum Type - { - kInvalid, - kWaitingForWork, - kWorking, - kSleeping, - }; -}; - - -struct IJob -{ - virtual void executeJob() = 0; -}; - -class ParallelForJob : public IJob -{ - const 
btIParallelForBody* mBody; - int mBegin; - int mEnd; - -public: - ParallelForJob() - { - mBody = NULL; - mBegin = 0; - mEnd = 0; - } - void init( int iBegin, int iEnd, const btIParallelForBody& body ) - { - mBody = &body; - mBegin = iBegin; - mEnd = iEnd; - } - virtual void executeJob() BT_OVERRIDE - { - BT_PROFILE( "executeJob" ); - - // call the functor body to do the work - mBody->forLoop( mBegin, mEnd ); - } -}; - - -struct JobContext -{ - JobContext() - { - m_queueLock = NULL; - m_headIndex = 0; - m_tailIndex = 0; - m_workersShouldCheckQueue = false; - m_useSpinMutex = false; - } - b3CriticalSection* m_queueLock; - btSpinMutex m_mutex; - volatile bool m_workersShouldCheckQueue; - - btAlignedObjectArray<IJob*> m_jobQueue; - bool m_queueIsEmpty; - int m_tailIndex; - int m_headIndex; - bool m_useSpinMutex; - - void lockQueue() - { - if ( m_useSpinMutex ) - { - m_mutex.lock(); - } - else - { - m_queueLock->lock(); - } - } - void unlockQueue() - { - if ( m_useSpinMutex ) - { - m_mutex.unlock(); - } - else - { - m_queueLock->unlock(); - } - } - void clearQueue() - { - lockQueue(); - m_headIndex = 0; - m_tailIndex = 0; - m_queueIsEmpty = true; - unlockQueue(); - m_jobQueue.resizeNoInitialize( 0 ); - } - void submitJob( IJob* job ) - { - m_jobQueue.push_back( job ); - lockQueue(); - m_tailIndex++; - m_queueIsEmpty = false; - unlockQueue(); - } - IJob* consumeJob() - { - if ( m_queueIsEmpty ) - { - // lock free path. 
even if this is taken erroneously it isn't harmful - return NULL; - } - IJob* job = NULL; - lockQueue(); - if ( !m_queueIsEmpty ) - { - job = m_jobQueue[ m_headIndex++ ]; - if ( m_headIndex == m_tailIndex ) - { - m_queueIsEmpty = true; - } - } - unlockQueue(); - return job; - } -}; - - -struct WorkerThreadLocalStorage -{ - int threadId; - WorkerThreadStatus::Type status; -}; - - -static void WorkerThreadFunc( void* userPtr, void* lsMemory ) -{ - BT_PROFILE( "WorkerThreadFunc" ); - WorkerThreadLocalStorage* localStorage = (WorkerThreadLocalStorage*) lsMemory; - localStorage->status = WorkerThreadStatus::kWaitingForWork; - //printf( "WorkerThreadFunc: worker %d start working\n", localStorage->threadId ); - - JobContext* jobContext = (JobContext*) userPtr; - - while ( jobContext->m_workersShouldCheckQueue ) - { - if ( IJob* job = jobContext->consumeJob() ) - { - localStorage->status = WorkerThreadStatus::kWorking; - job->executeJob(); - localStorage->status = WorkerThreadStatus::kWaitingForWork; - } - else - { - // todo: spin wait a bit to avoid hammering the empty queue - } - } - - //printf( "WorkerThreadFunc stop working\n" ); - localStorage->status = WorkerThreadStatus::kSleeping; - // go idle -} - - -static void* WorkerThreadAllocFunc() -{ - return new WorkerThreadLocalStorage; -} - - - -class btTaskSchedulerDefault : public btITaskScheduler -{ - JobContext m_jobContext; - b3ThreadSupportInterface* m_threadSupport; - btAlignedObjectArray<ParallelForJob> m_jobs; - btSpinMutex m_antiNestingLock; // prevent nested parallel-for - int m_numThreads; - int m_numWorkerThreads; - int m_numWorkersRunning; -public: - - btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport") - { - m_threadSupport = NULL; - m_numThreads = getNumHardwareThreads(); - // if can't detect number of cores, - if ( m_numThreads == 0 ) - { - // take a guess - m_numThreads = 4; - } - m_numWorkerThreads = m_numThreads - 1; - m_numWorkersRunning = 0; - } - - virtual ~btTaskSchedulerDefault() - { - shutdown(); - } - 
- void init() - { - int maxNumWorkerThreads = BT_MAX_THREAD_COUNT - 1; - m_threadSupport = createThreadSupport( maxNumWorkerThreads, WorkerThreadFunc, WorkerThreadAllocFunc, "TaskScheduler" ); - m_jobContext.m_queueLock = m_threadSupport->createCriticalSection(); - for ( int i = 0; i < maxNumWorkerThreads; i++ ) - { - WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i ); - btAssert( storage ); - storage->threadId = i; - storage->status = WorkerThreadStatus::kSleeping; - } - setWorkersActive( false ); // no work for them yet - } - - virtual void shutdown() - { - setWorkersActive( false ); - waitForWorkersToSleep(); - m_threadSupport->deleteCriticalSection( m_jobContext.m_queueLock ); - m_jobContext.m_queueLock = NULL; - - delete m_threadSupport; - m_threadSupport = NULL; - } - - void setWorkersActive( bool active ) - { - m_jobContext.m_workersShouldCheckQueue = active; - } - - virtual int getMaxNumThreads() const BT_OVERRIDE - { - return BT_MAX_THREAD_COUNT; - } - - virtual int getNumThreads() const BT_OVERRIDE - { - return m_numThreads; - } - - virtual void setNumThreads( int numThreads ) BT_OVERRIDE - { - m_numThreads = btMax( btMin(numThreads, int(BT_MAX_THREAD_COUNT)), 1 ); - m_numWorkerThreads = m_numThreads - 1; - } - - void waitJobs() - { - BT_PROFILE( "waitJobs" ); - // have the main thread work until the job queue is empty - for ( ;; ) - { - if ( IJob* job = m_jobContext.consumeJob() ) - { - job->executeJob(); - } - else - { - break; - } - } - // done with jobs for now, tell workers to rest - setWorkersActive( false ); - waitForWorkersToSleep(); - } - - void wakeWorkers() - { - BT_PROFILE( "wakeWorkers" ); - btAssert( m_jobContext.m_workersShouldCheckQueue ); - // tell each worker thread to start working - for ( int i = 0; i < m_numWorkerThreads; i++ ) - { - m_threadSupport->runTask( B3_THREAD_SCHEDULE_TASK, &m_jobContext, i ); - m_numWorkersRunning++; - } - } - - void waitForWorkersToSleep() - { - 
BT_PROFILE( "waitForWorkersToSleep" ); - while ( m_numWorkersRunning > 0 ) - { - int iThread; - int threadStatus; - m_threadSupport->waitForResponse( &iThread, &threadStatus ); // wait for worker threads to finish working - m_numWorkersRunning--; - } - //m_threadSupport->waitForAllTasksToComplete(); - for ( int i = 0; i < m_numWorkerThreads; i++ ) - { - //m_threadSupport->waitForTaskCompleted( i ); - WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i ); - btAssert( storage ); - btAssert( storage->status == WorkerThreadStatus::kSleeping ); - } - } - - virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE - { - BT_PROFILE( "parallelFor_ThreadSupport" ); - btAssert( iEnd >= iBegin ); - btAssert( grainSize >= 1 ); - int iterationCount = iEnd - iBegin; - if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() ) - { - int jobCount = ( iterationCount + grainSize - 1 ) / grainSize; - btAssert( jobCount >= 2 ); // need more than one job for multithreading - if ( jobCount > m_jobs.size() ) - { - m_jobs.resize( jobCount ); - } - if ( jobCount > m_jobContext.m_jobQueue.capacity() ) - { - m_jobContext.m_jobQueue.reserve( jobCount ); - } - - m_jobContext.clearQueue(); - // prepare worker threads for incoming work - setWorkersActive( true ); - wakeWorkers(); - // submit all of the jobs - int iJob = 0; - for ( int i = iBegin; i < iEnd; i += grainSize ) - { - btAssert( iJob < jobCount ); - int iE = btMin( i + grainSize, iEnd ); - ParallelForJob& job = m_jobs[ iJob ]; - job.init( i, iE, body ); - m_jobContext.submitJob( &job ); - iJob++; - } - - // put the main thread to work on emptying the job queue and then wait for all workers to finish - waitJobs(); - m_antiNestingLock.unlock(); - } - else - { - BT_PROFILE( "parallelFor_mainThread" ); - // just run on main thread - body.forLoop( iBegin, iEnd ); - } - } -}; - - - -btITaskScheduler* 
createDefaultTaskScheduler() -{ - btTaskSchedulerDefault* ts = new btTaskSchedulerDefault(); - ts->init(); - return ts; -} - -#else // #if BT_THREADSAFE - -btITaskScheduler* createDefaultTaskScheduler() -{ - return NULL; -} - -#endif // #else // #if BT_THREADSAFE \ No newline at end of file diff --git a/examples/MultiThreading/btTaskScheduler.h b/examples/MultiThreading/btTaskScheduler.h deleted file mode 100644 index a83b635eb..000000000 --- a/examples/MultiThreading/btTaskScheduler.h +++ /dev/null @@ -1,26 +0,0 @@ -/* -Copyright (c) 2003-2014 Erwin Coumans http://bullet.googlecode.com - -This software is provided 'as-is', without any express or implied warranty. -In no event will the authors be held liable for any damages arising from the use of this software. -Permission is granted to anyone to use this software for any purpose, -including commercial applications, and to alter it and redistribute it freely, -subject to the following restrictions: - -1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. -2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. -3. This notice may not be removed or altered from any source distribution. 
-*/ - - - -#ifndef BT_TASK_SCHEDULER_H -#define BT_TASK_SCHEDULER_H - - -class btITaskScheduler; - -btITaskScheduler* createDefaultTaskScheduler(); - - -#endif // BT_TASK_SCHEDULER_H diff --git a/src/BulletDynamics/CMakeLists.txt b/src/BulletDynamics/CMakeLists.txt index f8a6f34ba..2eb03c39a 100644 --- a/src/BulletDynamics/CMakeLists.txt +++ b/src/BulletDynamics/CMakeLists.txt @@ -15,6 +15,8 @@ SET(BulletDynamics_SRCS ConstraintSolver/btHingeConstraint.cpp ConstraintSolver/btPoint2PointConstraint.cpp ConstraintSolver/btSequentialImpulseConstraintSolver.cpp + ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp + ConstraintSolver/btBatchedConstraints.cpp ConstraintSolver/btNNCGConstraintSolver.cpp ConstraintSolver/btSliderConstraint.cpp ConstraintSolver/btSolve2LinearConstraint.cpp @@ -62,6 +64,7 @@ SET(ConstraintSolver_HDRS ConstraintSolver/btJacobianEntry.h ConstraintSolver/btPoint2PointConstraint.h ConstraintSolver/btSequentialImpulseConstraintSolver.h + ConstraintSolver/btSequentialImpulseConstraintSolverMt.h ConstraintSolver/btNNCGConstraintSolver.h ConstraintSolver/btSliderConstraint.h ConstraintSolver/btSolve2LinearConstraint.h diff --git a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp new file mode 100644 index 000000000..bc840e889 --- /dev/null +++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp @@ -0,0 +1,1129 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. 
The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + + +#include "btBatchedConstraints.h" + +#include "LinearMath/btIDebugDraw.h" +#include "LinearMath/btMinMax.h" +#include "LinearMath/btStackAlloc.h" +#include "LinearMath/btQuickprof.h" + +#include //for memset + +const int kNoMerge = -1; + +bool btBatchedConstraints::s_debugDrawBatches = false; + + +struct btBatchedConstraintInfo +{ + int constraintIndex; + int numConstraintRows; + int bodyIds[2]; +}; + + +struct btBatchInfo +{ + int phaseId; + int numConstraints; + int mergeIndex; + + btBatchInfo(int _phaseId = -1) : numConstraints(0), mergeIndex(-1), phaseId(_phaseId) {} +}; + + +bool btBatchedConstraints::validate(btConstraintArray* constraints, const btAlignedObjectArray& bodies) const +{ + // + // validate: for debugging only. Verify coloring of bodies, that no body is touched by more than one batch in any given phase + // + int errors = 0; + const int kUnassignedBatch = -1; + + btAlignedObjectArray bodyBatchId; + for (int iPhase = 0; iPhase < m_phases.size(); ++iPhase) + { + bodyBatchId.resizeNoInitialize(0); + bodyBatchId.resize( bodies.size(), kUnassignedBatch ); + const Range& phase = m_phases[iPhase]; + for (int iBatch = phase.begin; iBatch < phase.end; ++iBatch) + { + const Range& batch = m_batches[iBatch]; + for (int iiCons = batch.begin; iiCons < batch.end; ++iiCons) + { + int iCons = m_constraintIndices[iiCons]; + const btSolverConstraint& cons = constraints->at(iCons); + const btSolverBody& bodyA = bodies[cons.m_solverBodyIdA]; + const btSolverBody& bodyB = bodies[cons.m_solverBodyIdB]; + if (! 
bodyA.internalGetInvMass().isZero()) + { + int thisBodyBatchId = bodyBatchId[cons.m_solverBodyIdA]; + if (thisBodyBatchId == kUnassignedBatch) + { + bodyBatchId[cons.m_solverBodyIdA] = iBatch; + } + else if (thisBodyBatchId != iBatch) + { + btAssert( !"dynamic body is used in 2 different batches in the same phase" ); + errors++; + } + } + if (! bodyB.internalGetInvMass().isZero()) + { + int thisBodyBatchId = bodyBatchId[cons.m_solverBodyIdB]; + if (thisBodyBatchId == kUnassignedBatch) + { + bodyBatchId[cons.m_solverBodyIdB] = iBatch; + } + else if (thisBodyBatchId != iBatch) + { + btAssert( !"dynamic body is used in 2 different batches in the same phase" ); + errors++; + } + } + } + } + } + return errors == 0; +} + + +static void debugDrawSingleBatch( const btBatchedConstraints* bc, + btConstraintArray* constraints, + const btAlignedObjectArray& bodies, + int iBatch, + const btVector3& color, + const btVector3& offset + ) +{ + if (bc && bc->m_debugDrawer && iBatch < bc->m_batches.size()) + { + const btBatchedConstraints::Range& b = bc->m_batches[iBatch]; + for (int iiCon = b.begin; iiCon < b.end; ++iiCon) + { + int iCon = bc->m_constraintIndices[iiCon]; + const btSolverConstraint& con = constraints->at(iCon); + int iBody0 = con.m_solverBodyIdA; + int iBody1 = con.m_solverBodyIdB; + btVector3 pos0 = bodies[iBody0].getWorldTransform().getOrigin() + offset; + btVector3 pos1 = bodies[iBody1].getWorldTransform().getOrigin() + offset; + bc->m_debugDrawer->drawLine(pos0, pos1, color); + } + } +} + + +static void debugDrawPhase( const btBatchedConstraints* bc, + btConstraintArray* constraints, + const btAlignedObjectArray& bodies, + int iPhase, + const btVector3& color0, + const btVector3& color1, + const btVector3& offset + ) +{ + BT_PROFILE( "debugDrawPhase" ); + if ( bc && bc->m_debugDrawer && iPhase < bc->m_phases.size() ) + { + const btBatchedConstraints::Range& phase = bc->m_phases[iPhase]; + for (int iBatch = phase.begin; iBatch < phase.end; ++iBatch) + { + float tt 
= float(iBatch - phase.begin) / float(btMax(1, phase.end - phase.begin - 1)); + btVector3 col = lerp(color0, color1, tt); + debugDrawSingleBatch(bc, constraints, bodies, iBatch, col, offset); + } + } +} + + +static void debugDrawAllBatches( const btBatchedConstraints* bc, + btConstraintArray* constraints, + const btAlignedObjectArray& bodies + ) +{ + BT_PROFILE( "debugDrawAllBatches" ); + if ( bc && bc->m_debugDrawer && bc->m_phases.size() > 0 ) + { + btVector3 bboxMin(BT_LARGE_FLOAT, BT_LARGE_FLOAT, BT_LARGE_FLOAT); + btVector3 bboxMax = -bboxMin; + for (int iBody = 0; iBody < bodies.size(); ++iBody) + { + const btVector3& pos = bodies[iBody].getWorldTransform().getOrigin(); + bboxMin.setMin(pos); + bboxMax.setMax(pos); + } + btVector3 bboxExtent = bboxMax - bboxMin; + btVector3 offsetBase = btVector3( 0, bboxExtent.y()*1.1f, 0 ); + btVector3 offsetStep = btVector3( 0, 0, bboxExtent.z()*1.1f ); + int numPhases = bc->m_phases.size(); + for (int iPhase = 0; iPhase < numPhases; ++iPhase) + { + float b = float(iPhase)/float(numPhases-1); + btVector3 color0 = btVector3(1,0,b); + btVector3 color1 = btVector3(0,1,b); + btVector3 offset = offsetBase + offsetStep*(float(iPhase) - float(numPhases-1)*0.5); + debugDrawPhase(bc, constraints, bodies, iPhase, color0, color1, offset); + } + } +} + + +static void initBatchedBodyDynamicFlags(btAlignedObjectArray* outBodyDynamicFlags, const btAlignedObjectArray& bodies) +{ + BT_PROFILE("initBatchedBodyDynamicFlags"); + btAlignedObjectArray& bodyDynamicFlags = *outBodyDynamicFlags; + bodyDynamicFlags.resizeNoInitialize(bodies.size()); + for (int i = 0; i < bodies.size(); ++i) + { + const btSolverBody& body = bodies[ i ]; + bodyDynamicFlags[i] = ( body.internalGetInvMass().x() > btScalar( 0 ) ); + } +} + + +static int runLengthEncodeConstraintInfo(btBatchedConstraintInfo* outConInfos, int numConstraints) +{ + BT_PROFILE("runLengthEncodeConstraintInfo"); + // detect and run-length encode constraint rows that repeat the same bodies + 
int iDest = 0; + int iSrc = 0; + while (iSrc < numConstraints) + { + const btBatchedConstraintInfo& srcConInfo = outConInfos[iSrc]; + btBatchedConstraintInfo& conInfo = outConInfos[iDest]; + conInfo.constraintIndex = iSrc; + conInfo.bodyIds[0] = srcConInfo.bodyIds[0]; + conInfo.bodyIds[1] = srcConInfo.bodyIds[1]; + while (iSrc < numConstraints && outConInfos[iSrc].bodyIds[0] == srcConInfo.bodyIds[0] && outConInfos[iSrc].bodyIds[1] == srcConInfo.bodyIds[1]) + { + ++iSrc; + } + conInfo.numConstraintRows = iSrc - conInfo.constraintIndex; + btAssert( conInfo.numConstraintRows <= 6 ); + ++iDest; + } + return iDest; +} + + +struct ReadSolverConstraintsLoop : public btIParallelForBody +{ + btBatchedConstraintInfo* m_outConInfos; + btConstraintArray* m_constraints; + + ReadSolverConstraintsLoop( btBatchedConstraintInfo* outConInfos, btConstraintArray* constraints ) + { + m_outConInfos = outConInfos; + m_constraints = constraints; + } + void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + for (int i = iBegin; i < iEnd; ++i) + { + btBatchedConstraintInfo& conInfo = m_outConInfos[i]; + const btSolverConstraint& con = m_constraints->at( i ); + conInfo.bodyIds[0] = con.m_solverBodyIdA; + conInfo.bodyIds[1] = con.m_solverBodyIdB; + conInfo.constraintIndex = i; + conInfo.numConstraintRows = 1; + } + } +}; + + +static int initBatchedConstraintInfo(btBatchedConstraintInfo* outConInfos, btConstraintArray* constraints) +{ + BT_PROFILE("initBatchedConstraintInfo"); + int numConstraints = constraints->size(); + bool inParallel = true; + if (inParallel) + { + ReadSolverConstraintsLoop loop(outConInfos, constraints); + int grainSize = 1200; + btParallelFor(0, numConstraints, grainSize, loop); + } + else + { + for (int i = 0; i < numConstraints; ++i) + { + btBatchedConstraintInfo& conInfo = outConInfos[i]; + const btSolverConstraint& con = constraints->at( i ); + conInfo.bodyIds[0] = con.m_solverBodyIdA; + conInfo.bodyIds[1] = con.m_solverBodyIdB; + conInfo.constraintIndex = i; + 
conInfo.numConstraintRows = 1; + } + } + bool useRunLengthEncoding = true; + if (useRunLengthEncoding) + { + numConstraints = runLengthEncodeConstraintInfo(outConInfos, numConstraints); + } + return numConstraints; +} + + +static void expandConstraintRowsInPlace(int* constraintBatchIds, const btBatchedConstraintInfo* conInfos, int numConstraints, int numConstraintRows) +{ + BT_PROFILE("expandConstraintRowsInPlace"); + if (numConstraintRows > numConstraints) + { + // we walk the array in reverse to avoid overwriteing + for (int iCon = numConstraints - 1; iCon >= 0; --iCon) + { + const btBatchedConstraintInfo& conInfo = conInfos[iCon]; + int iBatch = constraintBatchIds[iCon]; + for (int i = conInfo.numConstraintRows - 1; i >= 0; --i) + { + int iDest = conInfo.constraintIndex + i; + btAssert(iDest >= iCon); + btAssert(iDest >= 0 && iDest < numConstraintRows); + constraintBatchIds[iDest] = iBatch; + } + } + } +} + + +static void expandConstraintRows(int* destConstraintBatchIds, const int* srcConstraintBatchIds, const btBatchedConstraintInfo* conInfos, int numConstraints, int numConstraintRows) +{ + BT_PROFILE("expandConstraintRows"); + for ( int iCon = 0; iCon < numConstraints; ++iCon ) + { + const btBatchedConstraintInfo& conInfo = conInfos[ iCon ]; + int iBatch = srcConstraintBatchIds[ iCon ]; + for ( int i = 0; i < conInfo.numConstraintRows; ++i ) + { + int iDest = conInfo.constraintIndex + i; + btAssert( iDest >= iCon ); + btAssert( iDest >= 0 && iDest < numConstraintRows ); + destConstraintBatchIds[ iDest ] = iBatch; + } + } +} + + +struct ExpandConstraintRowsLoop : public btIParallelForBody +{ + int* m_destConstraintBatchIds; + const int* m_srcConstraintBatchIds; + const btBatchedConstraintInfo* m_conInfos; + int m_numConstraintRows; + + ExpandConstraintRowsLoop( int* destConstraintBatchIds, const int* srcConstraintBatchIds, const btBatchedConstraintInfo* conInfos, int numConstraintRows) + { + m_destConstraintBatchIds = destConstraintBatchIds; + 
m_srcConstraintBatchIds = srcConstraintBatchIds; + m_conInfos = conInfos; + m_numConstraintRows = numConstraintRows; + } + void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + expandConstraintRows(m_destConstraintBatchIds, m_srcConstraintBatchIds + iBegin, m_conInfos + iBegin, iEnd - iBegin, m_numConstraintRows); + } +}; + + +static void expandConstraintRowsMt(int* destConstraintBatchIds, const int* srcConstraintBatchIds, const btBatchedConstraintInfo* conInfos, int numConstraints, int numConstraintRows) +{ + BT_PROFILE("expandConstraintRowsMt"); + ExpandConstraintRowsLoop loop(destConstraintBatchIds, srcConstraintBatchIds, conInfos, numConstraintRows); + int grainSize = 600; + btParallelFor(0, numConstraints, grainSize, loop); +} + + +static void initBatchedConstraintInfoArray(btAlignedObjectArray* outConInfos, btConstraintArray* constraints) +{ + BT_PROFILE("initBatchedConstraintInfoArray"); + btAlignedObjectArray& conInfos = *outConInfos; + int numConstraints = constraints->size(); + conInfos.resizeNoInitialize(numConstraints); + + int newSize = initBatchedConstraintInfo(&outConInfos->at(0), constraints); + conInfos.resizeNoInitialize(newSize); +} + + +static void mergeSmallBatches(btBatchInfo* batches, int iBeginBatch, int iEndBatch, int minBatchSize, int maxBatchSize) +{ + BT_PROFILE("mergeSmallBatches"); + for ( int iBatch = iEndBatch - 1; iBatch >= iBeginBatch; --iBatch ) + { + btBatchInfo& batch = batches[ iBatch ]; + if ( batch.mergeIndex == kNoMerge && batch.numConstraints > 0 && batch.numConstraints < minBatchSize ) + { + for ( int iDestBatch = iBatch - 1; iDestBatch >= iBeginBatch; --iDestBatch ) + { + btBatchInfo& destBatch = batches[ iDestBatch ]; + if ( destBatch.mergeIndex == kNoMerge && ( destBatch.numConstraints + batch.numConstraints ) < maxBatchSize ) + { + destBatch.numConstraints += batch.numConstraints; + batch.numConstraints = 0; + batch.mergeIndex = iDestBatch; + break; + } + } + } + } + // flatten mergeIndexes + // e.g. 
in case where A was merged into B and then B was merged into C, we need A to point to C instead of B + // Note: loop goes forward through batches because batches always merge from higher indexes to lower, + // so by going from low to high it reduces the amount of trail-following + for ( int iBatch = iBeginBatch; iBatch < iEndBatch; ++iBatch ) + { + btBatchInfo& batch = batches[ iBatch ]; + if ( batch.mergeIndex != kNoMerge ) + { + int iMergeDest = batches[ batch.mergeIndex ].mergeIndex; + // follow trail of merges to the end + while ( iMergeDest != kNoMerge ) + { + int iNext = batches[ iMergeDest ].mergeIndex; + if ( iNext == kNoMerge ) + { + batch.mergeIndex = iMergeDest; + break; + } + iMergeDest = iNext; + } + } + } +} + + +static void updateConstraintBatchIdsForMerges(int* constraintBatchIds, int numConstraints, const btBatchInfo* batches, int numBatches) +{ + BT_PROFILE("updateConstraintBatchIdsForMerges"); + // update batchIds to account for merges + for (int i = 0; i < numConstraints; ++i) + { + int iBatch = constraintBatchIds[i]; + btAssert(iBatch < numBatches); + // if this constraint references a batch that was merged into another batch + if (batches[iBatch].mergeIndex != kNoMerge) + { + // update batchId + constraintBatchIds[i] = batches[iBatch].mergeIndex; + } + } +} + + +struct UpdateConstraintBatchIdsForMergesLoop : public btIParallelForBody +{ + int* m_constraintBatchIds; + const btBatchInfo* m_batches; + int m_numBatches; + + UpdateConstraintBatchIdsForMergesLoop( int* constraintBatchIds, const btBatchInfo* batches, int numBatches ) + { + m_constraintBatchIds = constraintBatchIds; + m_batches = batches; + m_numBatches = numBatches; + } + void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + BT_PROFILE( "UpdateConstraintBatchIdsForMergesLoop" ); + updateConstraintBatchIdsForMerges( m_constraintBatchIds + iBegin, iEnd - iBegin, m_batches, m_numBatches ); + } +}; + + +static void updateConstraintBatchIdsForMergesMt(int* constraintBatchIds, int 
numConstraints, const btBatchInfo* batches, int numBatches) +{ + BT_PROFILE( "updateConstraintBatchIdsForMergesMt" ); + UpdateConstraintBatchIdsForMergesLoop loop(constraintBatchIds, batches, numBatches); + int grainSize = 800; + btParallelFor(0, numConstraints, grainSize, loop); +} + + +inline bool BatchCompare(const btBatchedConstraints::Range& a, const btBatchedConstraints::Range& b) +{ + int lenA = a.end - a.begin; + int lenB = b.end - b.begin; + return lenA > lenB; +} + + +static void writeOutConstraintIndicesForRangeOfBatches(btBatchedConstraints* bc, + const int* constraintBatchIds, + int numConstraints, + int* constraintIdPerBatch, + int batchBegin, + int batchEnd + ) +{ + BT_PROFILE("writeOutConstraintIndicesForRangeOfBatches"); + for ( int iCon = 0; iCon < numConstraints; ++iCon ) + { + int iBatch = constraintBatchIds[ iCon ]; + if (iBatch >= batchBegin && iBatch < batchEnd) + { + int iDestCon = constraintIdPerBatch[ iBatch ]; + constraintIdPerBatch[ iBatch ] = iDestCon + 1; + bc->m_constraintIndices[ iDestCon ] = iCon; + } + } +} + + +struct WriteOutConstraintIndicesLoop : public btIParallelForBody +{ + btBatchedConstraints* m_batchedConstraints; + const int* m_constraintBatchIds; + int m_numConstraints; + int* m_constraintIdPerBatch; + int m_maxNumBatchesPerPhase; + + WriteOutConstraintIndicesLoop( btBatchedConstraints* bc, const int* constraintBatchIds, int numConstraints, int* constraintIdPerBatch, int maxNumBatchesPerPhase ) + { + m_batchedConstraints = bc; + m_constraintBatchIds = constraintBatchIds; + m_numConstraints = numConstraints; + m_constraintIdPerBatch = constraintIdPerBatch; + m_maxNumBatchesPerPhase = maxNumBatchesPerPhase; + } + void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + BT_PROFILE( "WriteOutConstraintIndicesLoop" ); + int batchBegin = iBegin * m_maxNumBatchesPerPhase; + int batchEnd = iEnd * m_maxNumBatchesPerPhase; + writeOutConstraintIndicesForRangeOfBatches(m_batchedConstraints, + m_constraintBatchIds, + 
m_numConstraints, + m_constraintIdPerBatch, + batchBegin, + batchEnd + ); + } +}; + + +static void writeOutConstraintIndicesMt(btBatchedConstraints* bc, + const int* constraintBatchIds, + int numConstraints, + int* constraintIdPerBatch, + int maxNumBatchesPerPhase, + int numPhases + ) +{ + BT_PROFILE("writeOutConstraintIndicesMt"); + bool inParallel = true; + if (inParallel) + { + WriteOutConstraintIndicesLoop loop( bc, constraintBatchIds, numConstraints, constraintIdPerBatch, maxNumBatchesPerPhase ); + btParallelFor( 0, numPhases, 1, loop ); + } + else + { + for ( int iCon = 0; iCon < numConstraints; ++iCon ) + { + int iBatch = constraintBatchIds[ iCon ]; + int iDestCon = constraintIdPerBatch[ iBatch ]; + constraintIdPerBatch[ iBatch ] = iDestCon + 1; + bc->m_constraintIndices[ iDestCon ] = iCon; + } + } +} + + +static void writeGrainSizes(btBatchedConstraints* bc) +{ + typedef btBatchedConstraints::Range Range; + int numPhases = bc->m_phases.size(); + bc->m_phaseGrainSize.resizeNoInitialize(numPhases); + int numThreads = btGetTaskScheduler()->getNumThreads(); + for (int iPhase = 0; iPhase < numPhases; ++iPhase) + { + const Range& phase = bc->m_phases[ iPhase ]; + int numBatches = phase.end - phase.begin; + float grainSize = floor((0.25f*numBatches / float(numThreads)) + 0.0f); + bc->m_phaseGrainSize[ iPhase ] = btMax(1, int(grainSize)); + } +} + + +static void writeOutBatches(btBatchedConstraints* bc, + const int* constraintBatchIds, + int numConstraints, + const btBatchInfo* batches, + int* batchWork, + int maxNumBatchesPerPhase, + int numPhases +) +{ + BT_PROFILE("writeOutBatches"); + typedef btBatchedConstraints::Range Range; + bc->m_constraintIndices.reserve( numConstraints ); + bc->m_batches.resizeNoInitialize( 0 ); + bc->m_phases.resizeNoInitialize( 0 ); + + //int maxNumBatches = numPhases * maxNumBatchesPerPhase; + { + int* constraintIdPerBatch = batchWork; // for each batch, keep an index into the next available slot in the m_constraintIndices array + int 
iConstraint = 0; + for (int iPhase = 0; iPhase < numPhases; ++iPhase) + { + int curPhaseBegin = bc->m_batches.size(); + int iBegin = iPhase * maxNumBatchesPerPhase; + int iEnd = iBegin + maxNumBatchesPerPhase; + for ( int i = iBegin; i < iEnd; ++i ) + { + const btBatchInfo& batch = batches[ i ]; + int curBatchBegin = iConstraint; + constraintIdPerBatch[ i ] = curBatchBegin; // record the start of each batch in m_constraintIndices array + int numConstraints = batch.numConstraints; + iConstraint += numConstraints; + if ( numConstraints > 0 ) + { + bc->m_batches.push_back( Range( curBatchBegin, iConstraint ) ); + } + } + // if any batches were emitted this phase, + if ( bc->m_batches.size() > curPhaseBegin ) + { + // output phase + bc->m_phases.push_back( Range( curPhaseBegin, bc->m_batches.size() ) ); + } + } + + btAssert(iConstraint == numConstraints); + bc->m_constraintIndices.resizeNoInitialize( numConstraints ); + writeOutConstraintIndicesMt( bc, constraintBatchIds, numConstraints, constraintIdPerBatch, maxNumBatchesPerPhase, numPhases ); + } + // for each phase + for (int iPhase = 0; iPhase < bc->m_phases.size(); ++iPhase) + { + // sort the batches from largest to smallest (can be helpful to some task schedulers) + const Range& curBatches = bc->m_phases[iPhase]; + bc->m_batches.quickSortInternal(BatchCompare, curBatches.begin, curBatches.end-1); + } + bc->m_phaseOrder.resize(bc->m_phases.size()); + for (int i = 0; i < bc->m_phases.size(); ++i) + { + bc->m_phaseOrder[i] = i; + } + writeGrainSizes(bc); +} + + +// +// PreallocatedMemoryHelper -- helper object for allocating a number of chunks of memory in a single contiguous block. +// It is generally more efficient to do a single larger allocation than many smaller allocations. 
+// +// Example Usage: +// +// btVector3* bodyPositions = NULL; +// btBatchedConstraintInfo* conInfos = NULL; +// { +// PreallocatedMemoryHelper<8> memHelper; +// memHelper.addChunk( (void**) &bodyPositions, sizeof( btVector3 ) * bodies.size() ); +// memHelper.addChunk( (void**) &conInfos, sizeof( btBatchedConstraintInfo ) * numConstraints ); +// void* memPtr = malloc( memHelper.getSizeToAllocate() ); // allocate the memory +// memHelper.setChunkPointers( memPtr ); // update pointers to chunks +// } +template +class PreallocatedMemoryHelper +{ + struct Chunk + { + void** ptr; + size_t size; + }; + Chunk m_chunks[N]; + int m_numChunks; +public: + PreallocatedMemoryHelper() {m_numChunks=0;} + void addChunk( void** ptr, size_t sz ) + { + btAssert( m_numChunks < N ); + if ( m_numChunks < N ) + { + Chunk& chunk = m_chunks[ m_numChunks ]; + chunk.ptr = ptr; + chunk.size = sz; + m_numChunks++; + } + } + size_t getSizeToAllocate() const + { + size_t totalSize = 0; + for (int i = 0; i < m_numChunks; ++i) + { + totalSize += m_chunks[i].size; + } + return totalSize; + } + void setChunkPointers(void* mem) const + { + size_t totalSize = 0; + for (int i = 0; i < m_numChunks; ++i) + { + const Chunk& chunk = m_chunks[ i ]; + char* chunkPtr = static_cast(mem) + totalSize; + *chunk.ptr = chunkPtr; + totalSize += chunk.size; + } + } +}; + + + +static btVector3 findMaxDynamicConstraintExtent( + btVector3* bodyPositions, + bool* bodyDynamicFlags, + btBatchedConstraintInfo* conInfos, + int numConstraints, + int numBodies + ) +{ + BT_PROFILE("findMaxDynamicConstraintExtent"); + btVector3 consExtent = btVector3(1,1,1) * 0.001; + for (int iCon = 0; iCon < numConstraints; ++iCon) + { + const btBatchedConstraintInfo& con = conInfos[ iCon ]; + int iBody0 = con.bodyIds[0]; + int iBody1 = con.bodyIds[1]; + btAssert(iBody0 >= 0 && iBody0 < numBodies); + btAssert(iBody1 >= 0 && iBody1 < numBodies); + // is it a dynamic constraint? 
+ if (bodyDynamicFlags[iBody0] && bodyDynamicFlags[iBody1]) + { + btVector3 delta = bodyPositions[iBody1] - bodyPositions[iBody0]; + consExtent.setMax(delta.absolute()); + } + } + return consExtent; +} + + +struct btIntVec3 +{ + int m_ints[ 3 ]; + + SIMD_FORCE_INLINE const int& operator[](int i) const {return m_ints[i];} + SIMD_FORCE_INLINE int& operator[](int i) {return m_ints[i];} +}; + + +struct AssignConstraintsToGridBatchesParams +{ + bool* bodyDynamicFlags; + btIntVec3* bodyGridCoords; + int numBodies; + btBatchedConstraintInfo* conInfos; + char* constraintPhaseIds; + int* constraintBatchIds; + btIntVec3 gridChunkDim; + int maxNumBatchesPerPhase; + int numPhases; + int phaseMask; + + AssignConstraintsToGridBatchesParams() + { + memset(this, 0, sizeof(*this)); + } +}; + + +static void assignConstraintsToGridBatches(const AssignConstraintsToGridBatchesParams& params, int iConBegin, int iConEnd) +{ + BT_PROFILE("assignConstraintsToGridBatches"); + // (can be done in parallel) + for ( int iCon = iConBegin; iCon < iConEnd; ++iCon ) + { + const btBatchedConstraintInfo& con = params.conInfos[ iCon ]; + int iBody0 = con.bodyIds[ 0 ]; + int iBody1 = con.bodyIds[ 1 ]; + int iPhase = iCon; //iBody0; // pseudorandom choice to distribute evenly amongst phases + iPhase &= params.phaseMask; + int gridCoord[ 3 ]; + // is it a dynamic constraint? 
+ if ( params.bodyDynamicFlags[ iBody0 ] && params.bodyDynamicFlags[ iBody1 ] ) + { + const btIntVec3& body0Coords = params.bodyGridCoords[iBody0]; + const btIntVec3& body1Coords = params.bodyGridCoords[iBody1]; + // for each dimension x,y,z, + for (int i = 0; i < 3; ++i) + { + int coordMin = btMin(body0Coords.m_ints[i], body1Coords.m_ints[i]); + int coordMax = btMax(body0Coords.m_ints[i], body1Coords.m_ints[i]); + if (coordMin != coordMax) + { + btAssert( coordMax == coordMin + 1 ); + if ((coordMin&1) == 0) + { + iPhase &= ~(1 << i); // force bit off + } + else + { + iPhase |= (1 << i); // force bit on + iPhase &= params.phaseMask; + } + } + gridCoord[ i ] = coordMin; + } + } + else + { + if ( !params.bodyDynamicFlags[ iBody0 ] ) + { + iBody0 = con.bodyIds[ 1 ]; + } + btAssert(params.bodyDynamicFlags[ iBody0 ]); + const btIntVec3& body0Coords = params.bodyGridCoords[iBody0]; + // for each dimension x,y,z, + for ( int i = 0; i < 3; ++i ) + { + gridCoord[ i ] = body0Coords.m_ints[ i ]; + } + } + // calculate chunk coordinates + int chunkCoord[ 3 ]; + btIntVec3 gridChunkDim = params.gridChunkDim; + // for each dimension x,y,z, + for ( int i = 0; i < 3; ++i ) + { + int coordOffset = ( iPhase >> i ) & 1; + chunkCoord[ i ] = (gridCoord[ i ] - coordOffset)/2; + btClamp( chunkCoord[ i ], 0, gridChunkDim[ i ] - 1); + btAssert( chunkCoord[ i ] < gridChunkDim[ i ] ); + } + int iBatch = iPhase * params.maxNumBatchesPerPhase + chunkCoord[ 0 ] + chunkCoord[ 1 ] * gridChunkDim[ 0 ] + chunkCoord[ 2 ] * gridChunkDim[ 0 ] * gridChunkDim[ 1 ]; + btAssert(iBatch >= 0 && iBatch < params.maxNumBatchesPerPhase*params.numPhases); + params.constraintPhaseIds[ iCon ] = iPhase; + params.constraintBatchIds[ iCon ] = iBatch; + } +} + + +struct AssignConstraintsToGridBatchesLoop : public btIParallelForBody +{ + const AssignConstraintsToGridBatchesParams* m_params; + + AssignConstraintsToGridBatchesLoop( const AssignConstraintsToGridBatchesParams& params ) + { + m_params = ¶ms; + } + void 
forLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + assignConstraintsToGridBatches(*m_params, iBegin, iEnd); + } +}; + + +// +// setupSpatialGridBatchesMt -- generate batches using a uniform 3D grid +// +/* + +Bodies are treated as 3D points at their center of mass. We only consider dynamic bodies at this stage, +kinematic and static bodies are dealt with at a later stage. Also we only consider constraints that +are between 2 dynamic bodies ("dynamic" constraints) -- constraints that involve a static or kinematic body are handled later + +1. Compute a bounding box around all dynamic bodies +2. Compute the maximum extent of all dynamic constraints. Each dynamic constraint is treated as a line segment, and we need the size of + box that will fully enclose any single dynamic constraint + +3. Establish the cell size of our grid, the cell size in each dimension must be at least as large as the dynamic constraints max-extent, + so that no dynamic constraint can span more than 2 cells of our grid on any axis of the grid. The cell size should be adjusted + larger in order to keep the total number of cells from being excessively high + +Key idea: Given that each constraint spans 1 or 2 grid cells in each dimension, we can handle all dynamic constraints by processing + in chunks of 2x2x2 cells with 8 different 1-cell offsets ((0,0,0),(0,0,1),(0,1,0),(0,1,1),(1,0,0)...). + For each of the 8 offsets, we create a phase, and for each 2x2x2 chunk with dynamic constraints becomes a batch in that phase. + + Once all of the phases have been populated, if any of the phases end up with too few batches, they could possibly be merged with other phases. 
+ + Finally, we handle all of the remaining (non-dynamic) constraints, these can be added to whichever phase is least populated to help + even things out + +*/ +// +static void setupSpatialGridBatchesMt( + btBatchedConstraints* batchedConstraints, + btAlignedObjectArray* scratchMemory, + btConstraintArray* constraints, + const btAlignedObjectArray& bodies, + int minBatchSize, + int maxBatchSize, + bool use2DGrid +) +{ + BT_PROFILE("setupSpatialGridBatchesMt"); + const int numPhases = 8; + int numConstraints = constraints->size(); + int numConstraintRows = constraints->size(); + + const int maxGridChunkCount = 128; + int allocNumBatchesPerPhase = maxGridChunkCount; + int minNumBatchesPerPhase = 16; + int allocNumBatches = allocNumBatchesPerPhase * numPhases; + + btVector3* bodyPositions = NULL; + bool* bodyDynamicFlags = NULL; + btIntVec3* bodyGridCoords = NULL; + btBatchInfo* batches = NULL; + int* batchWork = NULL; + btBatchedConstraintInfo* conInfos = NULL; + char* constraintPhaseIds = NULL; + int* constraintBatchIds = NULL; + int* constraintRowBatchIds = NULL; + { + PreallocatedMemoryHelper<10> memHelper; + memHelper.addChunk( (void**) &bodyPositions, sizeof( btVector3 ) * bodies.size() ); + memHelper.addChunk( (void**) &bodyDynamicFlags, sizeof( bool ) * bodies.size() ); + memHelper.addChunk( (void**) &bodyGridCoords, sizeof( btIntVec3 ) * bodies.size() ); + memHelper.addChunk( (void**) &batches, sizeof( btBatchInfo )* allocNumBatches ); + memHelper.addChunk( (void**) &batchWork, sizeof( int )* allocNumBatches ); + memHelper.addChunk( (void**) &conInfos, sizeof( btBatchedConstraintInfo ) * numConstraints ); + memHelper.addChunk( (void**) &constraintPhaseIds, sizeof( char ) * numConstraints ); + memHelper.addChunk( (void**) &constraintBatchIds, sizeof( int ) * numConstraints ); + memHelper.addChunk( (void**) &constraintRowBatchIds, sizeof( int ) * numConstraintRows ); + size_t scratchSize = memHelper.getSizeToAllocate(); + scratchMemory->resizeNoInitialize( 
scratchSize ); + char* memPtr = &scratchMemory->at(0); + memHelper.setChunkPointers( memPtr ); + } + + numConstraints = initBatchedConstraintInfo(conInfos, constraints); + + // compute bounding box around all dynamic bodies + // (could be done in parallel) + btVector3 bboxMin(BT_LARGE_FLOAT, BT_LARGE_FLOAT, BT_LARGE_FLOAT); + btVector3 bboxMax = -bboxMin; + //int dynamicBodyCount = 0; + for (int i = 0; i < bodies.size(); ++i) + { + const btSolverBody& body = bodies[i]; + btVector3 bodyPos = body.getWorldTransform().getOrigin(); + bool isDynamic = ( body.internalGetInvMass().x() > btScalar( 0 ) ); + bodyPositions[i] = bodyPos; + bodyDynamicFlags[i] = isDynamic; + if (isDynamic) + { + //dynamicBodyCount++; + bboxMin.setMin(bodyPos); + bboxMax.setMax(bodyPos); + } + } + + // find max extent of all dynamic constraints + // (could be done in parallel) + btVector3 consExtent = findMaxDynamicConstraintExtent(bodyPositions, bodyDynamicFlags, conInfos, numConstraints, bodies.size()); + + btVector3 gridExtent = bboxMax - bboxMin; + + btVector3 gridCellSize = consExtent; + int gridDim[3]; + gridDim[ 0 ] = int( 1.0 + gridExtent.x() / gridCellSize.x() ); + gridDim[ 1 ] = int( 1.0 + gridExtent.y() / gridCellSize.y() ); + gridDim[ 2 ] = int( 1.0 + gridExtent.z() / gridCellSize.z() ); + + // if we can collapse an axis, it will cut our number of phases in half which could be more efficient + int phaseMask = 7; + bool collapseAxis = use2DGrid; + if ( collapseAxis ) + { + // pick the smallest axis to collapse, leaving us with the greatest number of cells in our grid + int iAxisToCollapse = 0; + int axisDim = gridDim[iAxisToCollapse]; + //for each dimension + for ( int i = 0; i < 3; ++i ) + { + if (gridDim[i] < axisDim) + { + iAxisToCollapse = i; + axisDim = gridDim[i]; + } + } + // collapse it + gridCellSize[iAxisToCollapse] = gridExtent[iAxisToCollapse] * 2.0f; + phaseMask &= ~(1 << iAxisToCollapse); + } + + int numGridChunks = 0; + btIntVec3 gridChunkDim; // each chunk is 2x2x2 
group of cells + while (true) + { + gridDim[0] = int( 1.0 + gridExtent.x() / gridCellSize.x() ); + gridDim[1] = int( 1.0 + gridExtent.y() / gridCellSize.y() ); + gridDim[2] = int( 1.0 + gridExtent.z() / gridCellSize.z() ); + gridChunkDim[ 0 ] = btMax( 1, ( gridDim[ 0 ] + 0 ) / 2 ); + gridChunkDim[ 1 ] = btMax( 1, ( gridDim[ 1 ] + 0 ) / 2 ); + gridChunkDim[ 2 ] = btMax( 1, ( gridDim[ 2 ] + 0 ) / 2 ); + numGridChunks = gridChunkDim[ 0 ] * gridChunkDim[ 1 ] * gridChunkDim[ 2 ]; + float nChunks = float(gridChunkDim[0]) * float(gridChunkDim[1]) * float(gridChunkDim[2]); // float copy of the product; numGridChunks above is susceptible to integer overflow + if ( numGridChunks <= maxGridChunkCount && nChunks <= maxGridChunkCount ) + { + break; + } + gridCellSize *= 1.25; // should roughly cut numCells in half + } + btAssert(numGridChunks <= maxGridChunkCount ); + int maxNumBatchesPerPhase = numGridChunks; + + // for each dynamic body, compute grid coords + btVector3 invGridCellSize = btVector3(1,1,1)/gridCellSize; + // (can be done in parallel) + for (int iBody = 0; iBody < bodies.size(); ++iBody) + { + btIntVec3& coords = bodyGridCoords[iBody]; + if (bodyDynamicFlags[iBody]) + { + btVector3 v = ( bodyPositions[ iBody ] - bboxMin )*invGridCellSize; + coords.m_ints[0] = int(v.x()); + coords.m_ints[1] = int(v.y()); + coords.m_ints[2] = int(v.z()); + btAssert(coords.m_ints[0] >= 0 && coords.m_ints[0] < gridDim[0]); + btAssert(coords.m_ints[1] >= 0 && coords.m_ints[1] < gridDim[1]); + btAssert(coords.m_ints[2] >= 0 && coords.m_ints[2] < gridDim[2]); + } + else + { + coords.m_ints[0] = -1; + coords.m_ints[1] = -1; + coords.m_ints[2] = -1; + } + } + + for (int iPhase = 0; iPhase < numPhases; ++iPhase) + { + int batchBegin = iPhase * maxNumBatchesPerPhase; + int batchEnd = batchBegin + maxNumBatchesPerPhase; + for ( int iBatch = batchBegin; iBatch < batchEnd; ++iBatch ) + { + btBatchInfo& batch = batches[ iBatch ]; + batch = btBatchInfo( iPhase ); + } + } + + { + AssignConstraintsToGridBatchesParams params; + 
params.bodyDynamicFlags = bodyDynamicFlags; + params.bodyGridCoords = bodyGridCoords; + params.numBodies = bodies.size(); + params.conInfos = conInfos; + params.constraintPhaseIds = constraintPhaseIds; + params.constraintBatchIds = constraintBatchIds; + params.gridChunkDim = gridChunkDim; + params.maxNumBatchesPerPhase = maxNumBatchesPerPhase; + params.numPhases = numPhases; + params.phaseMask = phaseMask; + bool inParallel = true; + if (inParallel) + { + AssignConstraintsToGridBatchesLoop loop(params); + int grainSize = 500; + btParallelFor(0, numConstraints, grainSize, loop); + } + else + { + assignConstraintsToGridBatches( params, 0, numConstraints ); + } + } + for ( int iCon = 0; iCon < numConstraints; ++iCon ) + { + const btBatchedConstraintInfo& con = conInfos[ iCon ]; + int iBatch = constraintBatchIds[ iCon ]; + btBatchInfo& batch = batches[iBatch]; + batch.numConstraints += con.numConstraintRows; + } + + for (int iPhase = 0; iPhase < numPhases; ++iPhase) + { + // if phase is legit, + if (iPhase == (iPhase&phaseMask)) + { + int iBeginBatch = iPhase * maxNumBatchesPerPhase; + int iEndBatch = iBeginBatch + maxNumBatchesPerPhase; + mergeSmallBatches( batches, iBeginBatch, iEndBatch, minBatchSize, maxBatchSize ); + } + } + // all constraints have been assigned a batchId + updateConstraintBatchIdsForMergesMt(constraintBatchIds, numConstraints, batches, maxNumBatchesPerPhase*numPhases); + + if (numConstraintRows > numConstraints) + { + expandConstraintRowsMt(&constraintRowBatchIds[0], &constraintBatchIds[0], &conInfos[0], numConstraints, numConstraintRows); + } + else + { + constraintRowBatchIds = constraintBatchIds; + } + + writeOutBatches(batchedConstraints, constraintRowBatchIds, numConstraintRows, batches, batchWork, maxNumBatchesPerPhase, numPhases); + btAssert(batchedConstraints->validate(constraints, bodies)); +} + + +static void setupSingleBatch( + btBatchedConstraints* bc, + int numConstraints +) +{ + BT_PROFILE("setupSingleBatch"); + typedef 
btBatchedConstraints::Range Range; + + bc->m_constraintIndices.resize( numConstraints ); + for ( int i = 0; i < numConstraints; ++i ) + { + bc->m_constraintIndices[ i ] = i; + } + + bc->m_batches.resizeNoInitialize( 0 ); + bc->m_phases.resizeNoInitialize( 0 ); + bc->m_phaseOrder.resizeNoInitialize( 0 ); + bc->m_phaseGrainSize.resizeNoInitialize( 0 ); + + if (numConstraints > 0) + { + bc->m_batches.push_back( Range( 0, numConstraints ) ); + bc->m_phases.push_back( Range( 0, 1 ) ); + bc->m_phaseOrder.push_back(0); + bc->m_phaseGrainSize.push_back(1); + } +} + + +void btBatchedConstraints::setup( + btConstraintArray* constraints, + const btAlignedObjectArray& bodies, + BatchingMethod batchingMethod, + int minBatchSize, + int maxBatchSize, + btAlignedObjectArray* scratchMemory + ) +{ + if (constraints->size() >= minBatchSize*4) + { + bool use2DGrid = batchingMethod == BatchingMethod::BATCHING_METHOD_SPATIAL_GRID_2D; + setupSpatialGridBatchesMt( this, scratchMemory, constraints, bodies, minBatchSize, maxBatchSize, use2DGrid ); + if (s_debugDrawBatches) + { + debugDrawAllBatches( this, constraints, bodies ); + } + } + else + { + setupSingleBatch( this, constraints->size() ); + } +} + + diff --git a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.h b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.h new file mode 100644 index 000000000..0fd8f31dd --- /dev/null +++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.h @@ -0,0 +1,66 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. 
The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef BT_BATCHED_CONSTRAINTS_H +#define BT_BATCHED_CONSTRAINTS_H + +#include "LinearMath/btThreads.h" +#include "LinearMath/btAlignedObjectArray.h" +#include "BulletDynamics/ConstraintSolver/btSolverBody.h" +#include "BulletDynamics/ConstraintSolver/btSolverConstraint.h" + + +class btIDebugDraw; + +struct btBatchedConstraints +{ + enum BatchingMethod + { + BATCHING_METHOD_SPATIAL_GRID_2D, + BATCHING_METHOD_SPATIAL_GRID_3D, + BATCHING_METHOD_COUNT + }; + struct Range + { + int begin; + int end; + + Range() : begin( 0 ), end( 0 ) {} + Range( int _beg, int _end ) : begin( _beg ), end( _end ) {} + }; + + btAlignedObjectArray m_constraintIndices; + btAlignedObjectArray m_batches; // each batch is a range of indices in the m_constraintIndices array + btAlignedObjectArray m_phases; // each phase is range of indices in the m_batches array + btAlignedObjectArray m_phaseGrainSize; // max grain size for each phase + btAlignedObjectArray m_phaseOrder; // phases can be done in any order, so we can randomize the order here + btIDebugDraw* m_debugDrawer; + + static bool s_debugDrawBatches; + + btBatchedConstraints() {m_debugDrawer=NULL;} + void setup( btConstraintArray* constraints, + const btAlignedObjectArray& bodies, + BatchingMethod batchingMethod, + int minBatchSize, + int maxBatchSize, + btAlignedObjectArray* scratchMemory + ); + bool validate( btConstraintArray* constraints, const btAlignedObjectArray& bodies ) const; +}; + + +#endif // BT_BATCHED_CONSTRAINTS_H + diff --git 
a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp index 1b2f211a1..c2a23dfb2 100644 --- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp +++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp @@ -1258,6 +1258,256 @@ void btSequentialImpulseConstraintSolver::convertContacts(btPersistentManifold** } } + +void btSequentialImpulseConstraintSolver::convertJoint(btSolverConstraint* currentConstraintRow, + btTypedConstraint* constraint, + const btTypedConstraint::btConstraintInfo1& info1, + int solverBodyIdA, + int solverBodyIdB, + const btContactSolverInfo& infoGlobal + ) +{ + const btRigidBody& rbA = constraint->getRigidBodyA(); + const btRigidBody& rbB = constraint->getRigidBodyB(); + + const btSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA]; + const btSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB]; + + int overrideNumSolverIterations = constraint->getOverrideNumSolverIterations() > 0 ? 
constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations; + if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations) + m_maxOverrideNumSolverIterations = overrideNumSolverIterations; + + for (int j=0;jgetDeltaLinearVelocity().isZero()); + btAssert(bodyAPtr->getDeltaAngularVelocity().isZero()); + btAssert(bodyAPtr->getPushVelocity().isZero()); + btAssert(bodyAPtr->getTurnVelocity().isZero()); + btAssert(bodyBPtr->getDeltaLinearVelocity().isZero()); + btAssert(bodyBPtr->getDeltaAngularVelocity().isZero()); + btAssert(bodyBPtr->getPushVelocity().isZero()); + btAssert(bodyBPtr->getTurnVelocity().isZero()); + //bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f); + //bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f); + //bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f); + //bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f); + //bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f); + //bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f); + //bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f); + //bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f); + + + btTypedConstraint::btConstraintInfo2 info2; + info2.fps = 1.f/infoGlobal.m_timeStep; + info2.erp = infoGlobal.m_erp; + info2.m_J1linearAxis = currentConstraintRow->m_contactNormal1; + info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal; + info2.m_J2linearAxis = currentConstraintRow->m_contactNormal2; + info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal; + info2.rowskip = sizeof(btSolverConstraint)/sizeof(btScalar);//check this + ///the size of btSolverConstraint needs be a multiple of btScalar + btAssert(info2.rowskip*sizeof(btScalar)== sizeof(btSolverConstraint)); + info2.m_constraintError = &currentConstraintRow->m_rhs; + currentConstraintRow->m_cfm = infoGlobal.m_globalCfm; + info2.m_damping = infoGlobal.m_damping; + info2.cfm = &currentConstraintRow->m_cfm; + info2.m_lowerLimit = 
&currentConstraintRow->m_lowerLimit; + info2.m_upperLimit = &currentConstraintRow->m_upperLimit; + info2.m_numIterations = infoGlobal.m_numIterations; + constraint->getInfo2(&info2); + + ///finalize the constraint setup + for (int j=0;j=constraint->getBreakingImpulseThreshold()) + { + solverConstraint.m_upperLimit = constraint->getBreakingImpulseThreshold(); + } + + if (solverConstraint.m_lowerLimit<=-constraint->getBreakingImpulseThreshold()) + { + solverConstraint.m_lowerLimit = -constraint->getBreakingImpulseThreshold(); + } + + solverConstraint.m_originalContactPoint = constraint; + + { + const btVector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal; + solverConstraint.m_angularComponentA = constraint->getRigidBodyA().getInvInertiaTensorWorld()*ftorqueAxis1*constraint->getRigidBodyA().getAngularFactor(); + } + { + const btVector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal; + solverConstraint.m_angularComponentB = constraint->getRigidBodyB().getInvInertiaTensorWorld()*ftorqueAxis2*constraint->getRigidBodyB().getAngularFactor(); + } + + { + btVector3 iMJlA = solverConstraint.m_contactNormal1*rbA.getInvMass(); + btVector3 iMJaA = rbA.getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal; + btVector3 iMJlB = solverConstraint.m_contactNormal2*rbB.getInvMass();//sign of normal? + btVector3 iMJaB = rbB.getInvInertiaTensorWorld()*solverConstraint.m_relpos2CrossNormal; + + btScalar sum = iMJlA.dot(solverConstraint.m_contactNormal1); + sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal); + sum += iMJlB.dot(solverConstraint.m_contactNormal2); + sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal); + btScalar fsum = btFabs(sum); + btAssert(fsum > SIMD_EPSILON); + btScalar sorRelaxation = 1.f;//todo: get from globalInfo? + solverConstraint.m_jacDiagABInv = fsum>SIMD_EPSILON?sorRelaxation/sum : 0.f; + } + + { + btScalar rel_vel; + btVector3 externalForceImpulseA = bodyAPtr->m_originalBody ? 
bodyAPtr->m_externalForceImpulse : btVector3(0,0,0); + btVector3 externalTorqueImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalTorqueImpulse : btVector3(0,0,0); + + btVector3 externalForceImpulseB = bodyBPtr->m_originalBody ? bodyBPtr->m_externalForceImpulse : btVector3(0,0,0); + btVector3 externalTorqueImpulseB = bodyBPtr->m_originalBody ?bodyBPtr->m_externalTorqueImpulse : btVector3(0,0,0); + + btScalar vel1Dotn = solverConstraint.m_contactNormal1.dot(rbA.getLinearVelocity()+externalForceImpulseA) + + solverConstraint.m_relpos1CrossNormal.dot(rbA.getAngularVelocity()+externalTorqueImpulseA); + + btScalar vel2Dotn = solverConstraint.m_contactNormal2.dot(rbB.getLinearVelocity()+externalForceImpulseB) + + solverConstraint.m_relpos2CrossNormal.dot(rbB.getAngularVelocity()+externalTorqueImpulseB); + + rel_vel = vel1Dotn+vel2Dotn; + btScalar restitution = 0.f; + btScalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2 + btScalar velocityError = restitution - rel_vel * info2.m_damping; + btScalar penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv; + btScalar velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv; + solverConstraint.m_rhs = penetrationImpulse+velocityImpulse; + solverConstraint.m_appliedImpulse = 0.f; + } + } +} + + +void btSequentialImpulseConstraintSolver::convertJoints(btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal) +{ + BT_PROFILE("convertJoints"); + for (int j=0;jbuildJacobian(); + constraint->internalSetAppliedImpulse(0.0f); + } + + int totalNumRows = 0; + + m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints); + //calculate the total number of contraint rows + for (int i=0;igetJointFeedback(); + if (fb) + { + fb->m_appliedForceBodyA.setZero(); + fb->m_appliedTorqueBodyA.setZero(); + fb->m_appliedForceBodyB.setZero(); + fb->m_appliedTorqueBodyB.setZero(); + } + + if (constraints[i]->isEnabled()) + { + 
constraints[i]->getInfo1(&info1); + } else + { + info1.m_numConstraintRows = 0; + info1.nub = 0; + } + totalNumRows += info1.m_numConstraintRows; + } + m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows); + + + ///setup the btSolverConstraints + int currentRow = 0; + + for (int i=0;igetRigidBodyA(); + btRigidBody& rbB = constraint->getRigidBodyB(); + + int solverBodyIdA = getOrInitSolverBody(rbA,infoGlobal.m_timeStep); + int solverBodyIdB = getOrInitSolverBody(rbB,infoGlobal.m_timeStep); + + convertJoint(currentConstraintRow, constraint, info1, solverBodyIdA, solverBodyIdB, infoGlobal); + } + currentRow+=info1.m_numConstraintRows; + } +} + + +void btSequentialImpulseConstraintSolver::convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal) +{ + BT_PROFILE("convertBodies"); + for (int i = 0; i < numBodies; i++) + { + bodies[i]->setCompanionId(-1); + } +#if BT_THREADSAFE + m_kinematicBodyUniqueIdToSolverBodyTable.resize( 0 ); +#endif // BT_THREADSAFE + + m_tmpSolverBodyPool.reserve(numBodies+1); + m_tmpSolverBodyPool.resize(0); + + //btSolverBody& fixedBody = m_tmpSolverBodyPool.expand(); + //initSolverBody(&fixedBody,0); + + for (int i=0;igetInvMass()) + { + btSolverBody& solverBody = m_tmpSolverBodyPool[bodyId]; + btVector3 gyroForce (0,0,0); + if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_EXPLICIT) + { + gyroForce = body->computeGyroscopicForceExplicit(infoGlobal.m_maxGyroscopicForce); + solverBody.m_externalTorqueImpulse -= gyroForce*body->getInvInertiaTensorWorld()*infoGlobal.m_timeStep; + } + if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_WORLD) + { + gyroForce = body->computeGyroscopicImpulseImplicit_World(infoGlobal.m_timeStep); + solverBody.m_externalTorqueImpulse += gyroForce; + } + if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_BODY) + { + gyroForce = body->computeGyroscopicImpulseImplicit_Body(infoGlobal.m_timeStep); + solverBody.m_externalTorqueImpulse += gyroForce; + + } + } + 
} +} + + btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) { m_fixedBodyId = -1; @@ -1344,250 +1594,13 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol #endif //BT_ADDITIONAL_DEBUG - for (int i = 0; i < numBodies; i++) - { - bodies[i]->setCompanionId(-1); - } -#if BT_THREADSAFE - m_kinematicBodyUniqueIdToSolverBodyTable.resize( 0 ); -#endif // BT_THREADSAFE - - m_tmpSolverBodyPool.reserve(numBodies+1); - m_tmpSolverBodyPool.resize(0); - - //btSolverBody& fixedBody = m_tmpSolverBodyPool.expand(); - //initSolverBody(&fixedBody,0); - //convert all bodies + convertBodies(bodies, numBodies, infoGlobal); + convertJoints(constraints, numConstraints, infoGlobal); - for (int i=0;igetInvMass()) - { - btSolverBody& solverBody = m_tmpSolverBodyPool[bodyId]; - btVector3 gyroForce (0,0,0); - if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_EXPLICIT) - { - gyroForce = body->computeGyroscopicForceExplicit(infoGlobal.m_maxGyroscopicForce); - solverBody.m_externalTorqueImpulse -= gyroForce*body->getInvInertiaTensorWorld()*infoGlobal.m_timeStep; - } - if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_WORLD) - { - gyroForce = body->computeGyroscopicImpulseImplicit_World(infoGlobal.m_timeStep); - solverBody.m_externalTorqueImpulse += gyroForce; - } - if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_BODY) - { - gyroForce = body->computeGyroscopicImpulseImplicit_Body(infoGlobal.m_timeStep); - solverBody.m_externalTorqueImpulse += gyroForce; - - } - - - } - } - - if (1) - { - int j; - for (j=0;jbuildJacobian(); - constraint->internalSetAppliedImpulse(0.0f); - } - } - - //btRigidBody* rb0=0,*rb1=0; - - //if (1) - { - { - - int totalNumRows = 0; - int i; - - 
m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints); - //calculate the total number of contraint rows - for (i=0;igetJointFeedback(); - if (fb) - { - fb->m_appliedForceBodyA.setZero(); - fb->m_appliedTorqueBodyA.setZero(); - fb->m_appliedForceBodyB.setZero(); - fb->m_appliedTorqueBodyB.setZero(); - } - - if (constraints[i]->isEnabled()) - { - constraints[i]->getInfo1(&info1); - } else - { - info1.m_numConstraintRows = 0; - info1.nub = 0; - } - totalNumRows += info1.m_numConstraintRows; - } - m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows); - - - ///setup the btSolverConstraints - int currentRow = 0; - - for (i=0;igetRigidBodyA(); - btRigidBody& rbB = constraint->getRigidBodyB(); - - int solverBodyIdA = getOrInitSolverBody(rbA,infoGlobal.m_timeStep); - int solverBodyIdB = getOrInitSolverBody(rbB,infoGlobal.m_timeStep); - - btSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA]; - btSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB]; - - - - - int overrideNumSolverIterations = constraint->getOverrideNumSolverIterations() > 0 ? 
constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations; - if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations) - m_maxOverrideNumSolverIterations = overrideNumSolverIterations; - - - int j; - for ( j=0;jinternalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f); - bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f); - bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f); - bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f); - bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f); - bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f); - bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f); - bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f); - - - btTypedConstraint::btConstraintInfo2 info2; - info2.fps = 1.f/infoGlobal.m_timeStep; - info2.erp = infoGlobal.m_erp; - info2.m_J1linearAxis = currentConstraintRow->m_contactNormal1; - info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal; - info2.m_J2linearAxis = currentConstraintRow->m_contactNormal2; - info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal; - info2.rowskip = sizeof(btSolverConstraint)/sizeof(btScalar);//check this - ///the size of btSolverConstraint needs be a multiple of btScalar - btAssert(info2.rowskip*sizeof(btScalar)== sizeof(btSolverConstraint)); - info2.m_constraintError = &currentConstraintRow->m_rhs; - currentConstraintRow->m_cfm = infoGlobal.m_globalCfm; - info2.m_damping = infoGlobal.m_damping; - info2.cfm = &currentConstraintRow->m_cfm; - info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit; - info2.m_upperLimit = &currentConstraintRow->m_upperLimit; - info2.m_numIterations = infoGlobal.m_numIterations; - constraints[i]->getInfo2(&info2); - - ///finalize the constraint setup - for ( j=0;j=constraints[i]->getBreakingImpulseThreshold()) - { - solverConstraint.m_upperLimit = constraints[i]->getBreakingImpulseThreshold(); - } - - if 
(solverConstraint.m_lowerLimit<=-constraints[i]->getBreakingImpulseThreshold()) - { - solverConstraint.m_lowerLimit = -constraints[i]->getBreakingImpulseThreshold(); - } - - solverConstraint.m_originalContactPoint = constraint; - - { - const btVector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal; - solverConstraint.m_angularComponentA = constraint->getRigidBodyA().getInvInertiaTensorWorld()*ftorqueAxis1*constraint->getRigidBodyA().getAngularFactor(); - } - { - const btVector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal; - solverConstraint.m_angularComponentB = constraint->getRigidBodyB().getInvInertiaTensorWorld()*ftorqueAxis2*constraint->getRigidBodyB().getAngularFactor(); - } - - { - btVector3 iMJlA = solverConstraint.m_contactNormal1*rbA.getInvMass(); - btVector3 iMJaA = rbA.getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal; - btVector3 iMJlB = solverConstraint.m_contactNormal2*rbB.getInvMass();//sign of normal? - btVector3 iMJaB = rbB.getInvInertiaTensorWorld()*solverConstraint.m_relpos2CrossNormal; - - btScalar sum = iMJlA.dot(solverConstraint.m_contactNormal1); - sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal); - sum += iMJlB.dot(solverConstraint.m_contactNormal2); - sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal); - btScalar fsum = btFabs(sum); - btAssert(fsum > SIMD_EPSILON); - btScalar sorRelaxation = 1.f;//todo: get from globalInfo? - solverConstraint.m_jacDiagABInv = fsum>SIMD_EPSILON?sorRelaxation/sum : 0.f; - } - - - - { - btScalar rel_vel; - btVector3 externalForceImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalForceImpulse : btVector3(0,0,0); - btVector3 externalTorqueImpulseA = bodyAPtr->m_originalBody ? bodyAPtr->m_externalTorqueImpulse : btVector3(0,0,0); - - btVector3 externalForceImpulseB = bodyBPtr->m_originalBody ? 
bodyBPtr->m_externalForceImpulse : btVector3(0,0,0); - btVector3 externalTorqueImpulseB = bodyBPtr->m_originalBody ?bodyBPtr->m_externalTorqueImpulse : btVector3(0,0,0); - - btScalar vel1Dotn = solverConstraint.m_contactNormal1.dot(rbA.getLinearVelocity()+externalForceImpulseA) - + solverConstraint.m_relpos1CrossNormal.dot(rbA.getAngularVelocity()+externalTorqueImpulseA); - - btScalar vel2Dotn = solverConstraint.m_contactNormal2.dot(rbB.getLinearVelocity()+externalForceImpulseB) - + solverConstraint.m_relpos2CrossNormal.dot(rbB.getAngularVelocity()+externalTorqueImpulseB); - - rel_vel = vel1Dotn+vel2Dotn; - btScalar restitution = 0.f; - btScalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2 - btScalar velocityError = restitution - rel_vel * info2.m_damping; - btScalar penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv; - btScalar velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv; - solverConstraint.m_rhs = penetrationImpulse+velocityImpulse; - solverConstraint.m_appliedImpulse = 0.f; - - - } - } - } - currentRow+=m_tmpConstraintSizesPool[i].m_numConstraintRows; - } - } - - convertContacts(manifoldPtr,numManifolds,infoGlobal); - - } // btContactSolverInfo info = infoGlobal; @@ -1627,6 +1640,7 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup(btCol btScalar btSequentialImpulseConstraintSolver::solveSingleIteration(int iteration, btCollisionObject** /*bodies */,int /*numBodies*/,btPersistentManifold** /*manifoldPtr*/, int /*numManifolds*/,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* /*debugDrawer*/) { + BT_PROFILE("solveSingleIteration"); btScalar leastSquaresResidual = 0.f; int numNonContactPool = m_tmpSolverNonContactConstraintPool.size(); @@ -1805,6 +1819,7 @@ btScalar btSequentialImpulseConstraintSolver::solveSingleIteration(int iteration void 
btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) { + BT_PROFILE("solveGroupCacheFriendlySplitImpulseIterations"); int iteration; if (infoGlobal.m_splitImpulse) { @@ -1863,14 +1878,9 @@ btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyIterations( return 0.f; } -btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal) +void btSequentialImpulseConstraintSolver::writeBackContacts(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal) { - int numPoolConstraints = m_tmpSolverContactConstraintPool.size(); - int i,j; - - if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING) - { - for (j=0;jsetEnabled(false); } } +} - - for ( i=0;isetCompanionId(-1); } } +} + +btScalar btSequentialImpulseConstraintSolver::solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal) +{ + BT_PROFILE("solveGroupCacheFriendlyFinish"); + + if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING) + { + writeBackContacts(0, m_tmpSolverContactConstraintPool.size(), infoGlobal); + } + + writeBackJoints(0, m_tmpSolverNonContactConstraintPool.size(), infoGlobal); + writeBackBodies(0, m_tmpSolverBodyPool.size(), infoGlobal); m_tmpSolverContactConstraintPool.resizeNoInitialize(0); m_tmpSolverNonContactConstraintPool.resizeNoInitialize(0); diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h index 16c7eb74c..8c9c67f85 100644 --- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h +++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h @@ 
-95,6 +95,10 @@ protected: void convertContact(btPersistentManifold* manifold,const btContactSolverInfo& infoGlobal); + virtual void convertJoints(btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal); + void convertJoint(btSolverConstraint* destConstraintRow, btTypedConstraint* srcConstraint, const btTypedConstraint::btConstraintInfo1& info1, int solverBodyIdA, int solverBodyIdB, const btContactSolverInfo& infoGlobal); + + virtual void convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal); btSimdScalar resolveSplitPenetrationSIMD(btSolverBody& bodyA,btSolverBody& bodyB, const btSolverConstraint& contactConstraint) { @@ -121,7 +125,9 @@ protected: protected: - + void writeBackContacts(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal); + void writeBackJoints(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal); + void writeBackBodies(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal); virtual void solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer); virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies,int numBodies,const btContactSolverInfo& infoGlobal); virtual btScalar solveSingleIteration(int iteration, btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer); diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp new file mode 100644 index 000000000..b09665b15 --- /dev/null +++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp @@ -0,0 +1,1611 
@@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "btSequentialImpulseConstraintSolverMt.h"
+
+#include "LinearMath/btQuickprof.h"
+
+#include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"
+
+#include "BulletDynamics/ConstraintSolver/btTypedConstraint.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+
+
+// Class-wide tuning knobs (static members); they are read by solveGroupCacheFriendlySetup and the setupBatched*Constraints helpers below.
+bool btSequentialImpulseConstraintSolverMt::s_allowNestedParallelForLoops = false;  // some task schedulers don't like nested loops; when false, batching is skipped while btThreadsAreRunning() is true
+int btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching = 250;  // batching is only switched on when numManifolds >= this value
+int btSequentialImpulseConstraintSolverMt::s_minBatchSize = 50;  // forwarded to m_batchedContactConstraints / m_batchedJointConstraints setup()
+int btSequentialImpulseConstraintSolverMt::s_maxBatchSize = 100;  // forwarded to m_batchedContactConstraints / m_batchedJointConstraints setup()
+btBatchedConstraints::BatchingMethod btSequentialImpulseConstraintSolverMt::s_contactBatchingMethod = btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_2D;  // used by setupBatchedContactConstraints
+btBatchedConstraints::BatchingMethod btSequentialImpulseConstraintSolverMt::s_jointBatchingMethod = btBatchedConstraints::BATCHING_METHOD_SPATIAL_GRID_2D;  // used by setupBatchedJointConstraints
+
+
+btSequentialImpulseConstraintSolverMt::btSequentialImpulseConstraintSolverMt()
+{
+    m_numFrictionDirections = 1;  // recomputed from the solver mode in solveGroupCacheFriendlySetup
+    m_useBatching = false;  // re-decided on every solve in solveGroupCacheFriendlySetup
+    m_useObsoleteJointConstraints = false;
+}
+
+
+btSequentialImpulseConstraintSolverMt::~btSequentialImpulseConstraintSolverMt()
+{
+}
+
+// Builds the batches for the contact constraint pool using the contact batching method and the static batch size limits.
+void btSequentialImpulseConstraintSolverMt::setupBatchedContactConstraints()
+{
+    BT_PROFILE("setupBatchedContactConstraints");
+    m_batchedContactConstraints.setup( &m_tmpSolverContactConstraintPool,
+        m_tmpSolverBodyPool,
+        s_contactBatchingMethod,
+        s_minBatchSize,
+        s_maxBatchSize,
+        &m_scratchMemory
+        );
+}
+
+// Builds the batches for the non-contact (joint) constraint pool using the joint batching method and the static batch size limits.
+void btSequentialImpulseConstraintSolverMt::setupBatchedJointConstraints()
+{
+    BT_PROFILE("setupBatchedJointConstraints");
+    m_batchedJointConstraints.setup( &m_tmpSolverNonContactConstraintPool,
+        m_tmpSolverBodyPool,
+        s_jointBatchingMethod,
+        s_minBatchSize,
+        s_maxBatchSize,
+        &m_scratchMemory
+        );
+}
+
+// Fills in contact row iContactConstraint plus its friction / rolling friction rows.
+// The row must already carry its body ids, contact point and friction index (assigned in internalAllocContactConstraints).
+void btSequentialImpulseConstraintSolverMt::internalSetupContactConstraints(int iContactConstraint, const btContactSolverInfo& infoGlobal)
+{
+    btSolverConstraint& contactConstraint = m_tmpSolverContactConstraintPool[iContactConstraint];
+
+    btVector3 rel_pos1;
+    btVector3 rel_pos2;
+    btScalar relaxation;
+
+    int solverBodyIdA = contactConstraint.m_solverBodyIdA;
+    int solverBodyIdB = contactConstraint.m_solverBodyIdB;
+
+    btSolverBody* solverBodyA = &m_tmpSolverBodyPool[ solverBodyIdA ];
+    btSolverBody* solverBodyB = &m_tmpSolverBodyPool[ solverBodyIdB ];
+
+    btRigidBody* colObj0 = solverBodyA->m_originalBody;
+    btRigidBody* colObj1 = solverBodyB->m_originalBody;
+
+    btManifoldPoint& cp = *static_cast<btManifoldPoint*>( contactConstraint.m_originalContactPoint );  // pointer stored by internalAllocContactConstraints
+
+    const btVector3& pos1 = cp.getPositionWorldOnA();
+    const btVector3& pos2 = cp.getPositionWorldOnB();
+
+    rel_pos1 = pos1 - solverBodyA->getWorldTransform().getOrigin();
+    rel_pos2 = pos2 - solverBodyB->getWorldTransform().getOrigin();
+
+    btVector3 vel1;
+    btVector3 vel2;
+
+    solverBodyA->getVelocityInLocalPointNoDelta(
rel_pos1, vel1 );
+    solverBodyB->getVelocityInLocalPointNoDelta( rel_pos2, vel2 );
+
+    btVector3 vel = vel1 - vel2;
+    btScalar rel_vel = cp.m_normalWorldOnB.dot( vel );  // relative velocity along the contact normal
+
+    setupContactConstraint( contactConstraint, solverBodyIdA, solverBodyIdB, cp, infoGlobal, relaxation, rel_pos1, rel_pos2 );
+
+    // setup rolling friction constraints
+    int rollingFrictionIndex = m_rollingFrictionIndexTable[iContactConstraint];  // -1 when this contact has no rolling friction rows
+    if (rollingFrictionIndex >= 0)
+    {
+        // first of the 3 reserved rows: spinning friction about the contact normal
+        btSolverConstraint& spinningFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[ rollingFrictionIndex ];
+        btAssert( spinningFrictionConstraint.m_frictionIndex == iContactConstraint );
+        setupTorsionalFrictionConstraint( spinningFrictionConstraint,
+            cp.m_normalWorldOnB,
+            solverBodyIdA,
+            solverBodyIdB,
+            cp,
+            cp.m_combinedSpinningFriction,
+            rel_pos1,
+            rel_pos2,
+            colObj0,
+            colObj1,
+            relaxation,
+            0.0f,
+            0.0f
+            );
+        btVector3 axis[2];
+        btPlaneSpace1( cp.m_normalWorldOnB, axis[0], axis[1] );
+        axis[0].normalize();
+        axis[1].normalize();
+
+        applyAnisotropicFriction( colObj0, axis[0], btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION );
+        applyAnisotropicFriction( colObj1, axis[0], btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION );
+        applyAnisotropicFriction( colObj0, axis[1], btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION );
+        applyAnisotropicFriction( colObj1, axis[1], btCollisionObject::CF_ANISOTROPIC_ROLLING_FRICTION );
+        // put the largest axis first
+        if (axis[1].length2() > axis[0].length2())
+        {
+            btSwap(axis[0], axis[1]);
+        }
+        const btScalar kRollingFrictionThreshold = 0.001f;  // axes no longer than this get their row disabled below
+        for (int i = 0; i < 2; ++i)
+        {
+            int iRollingFric = rollingFrictionIndex + 1 + i;  // rows 2 and 3 of the reserved triple
+            btSolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[ iRollingFric ];
+            btAssert(rollingFrictionConstraint.m_frictionIndex == iContactConstraint);
+            btVector3 dir = axis[i];
+            if ( dir.length() > kRollingFrictionThreshold )
+            {
+                setupTorsionalFrictionConstraint( rollingFrictionConstraint,
+                    dir,
+                    solverBodyIdA,
+                    solverBodyIdB,
+                    cp,
+                    cp.m_combinedRollingFriction,
+                    rel_pos1,
+                    rel_pos2,
+                    colObj0,
+                    colObj1,
+                    relaxation,
+                    0.0f,
+                    0.0f
+                    );
+            }
+            else
+            {
+                rollingFrictionConstraint.m_frictionIndex = -1; // disable constraint
+            }
+        }
+    }
+
+    // setup friction constraints
+    // setupFrictionConstraint(solverConstraint, normalAxis, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, infoGlobal, desiredVelocity, cfmSlip);
+    {
+        ///Bullet has several options to set the friction directions
+        ///By default, each contact has only a single friction direction that is recomputed automatically every frame
+        ///based on the relative linear velocity.
+        ///If the relative velocity is zero, it will automatically compute a friction direction.
+
+        ///You can also enable two friction directions, using the SOLVER_USE_2_FRICTION_DIRECTIONS.
+        ///In that case, the second friction direction will be orthogonal to both contact normal and first friction direction.
+        ///
+        ///If you choose SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION, then the friction will be independent from the relative projected velocity.
+        ///
+        ///The user can manually override the friction directions for certain contacts using a contact callback,
+        ///and set the BT_CONTACT_FLAG_LATERAL_FRICTION_INITIALIZED bit in cp.m_contactPointFlags
+        ///In that case, you can set the target relative motion in each friction direction (cp.m_contactMotion1 and cp.m_contactMotion2)
+        ///this will give a conveyor belt effect
+        ///
+        btSolverConstraint* frictionConstraint1 = &m_tmpSolverContactFrictionConstraintPool[contactConstraint.m_frictionIndex];
+        btAssert(frictionConstraint1->m_frictionIndex == iContactConstraint);
+
+        btSolverConstraint* frictionConstraint2 = NULL;  // stays NULL unless two friction directions are enabled
+        if ( infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS )
+        {
+            frictionConstraint2 = &m_tmpSolverContactFrictionConstraintPool[contactConstraint.m_frictionIndex + 1];
+            btAssert( frictionConstraint2->m_frictionIndex == iContactConstraint );
+        }
+
+        if ( !( infoGlobal.m_solverMode & SOLVER_ENABLE_FRICTION_DIRECTION_CACHING ) || !( cp.m_contactPointFlags&BT_CONTACT_FLAG_LATERAL_FRICTION_INITIALIZED ) )
+        {
+            cp.m_lateralFrictionDir1 = vel - cp.m_normalWorldOnB * rel_vel;  // relative velocity with its normal component removed
+            btScalar lat_rel_vel = cp.m_lateralFrictionDir1.length2();  // squared length, despite the name
+            if ( !( infoGlobal.m_solverMode & SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION ) && lat_rel_vel > SIMD_EPSILON )
+            {
+                cp.m_lateralFrictionDir1 *= 1.f / btSqrt( lat_rel_vel );
+                applyAnisotropicFriction( colObj0, cp.m_lateralFrictionDir1, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                applyAnisotropicFriction( colObj1, cp.m_lateralFrictionDir1, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                setupFrictionConstraint( *frictionConstraint1, cp.m_lateralFrictionDir1, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, infoGlobal );
+
+                if ( frictionConstraint2 )
+                {
+                    cp.m_lateralFrictionDir2 = cp.m_lateralFrictionDir1.cross( cp.m_normalWorldOnB );
+                    cp.m_lateralFrictionDir2.normalize();//??
+                    applyAnisotropicFriction( colObj0, cp.m_lateralFrictionDir2, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                    applyAnisotropicFriction( colObj1, cp.m_lateralFrictionDir2, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                    setupFrictionConstraint( *frictionConstraint2, cp.m_lateralFrictionDir2, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, infoGlobal );
+                }
+            }
+            else
+            {
+                btPlaneSpace1( cp.m_normalWorldOnB, cp.m_lateralFrictionDir1, cp.m_lateralFrictionDir2 );  // directions derived from the normal only
+
+                applyAnisotropicFriction( colObj0, cp.m_lateralFrictionDir1, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                applyAnisotropicFriction( colObj1, cp.m_lateralFrictionDir1, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                setupFrictionConstraint( *frictionConstraint1, cp.m_lateralFrictionDir1, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, infoGlobal );
+
+                if ( frictionConstraint2 )
+                {
+                    applyAnisotropicFriction( colObj0, cp.m_lateralFrictionDir2, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                    applyAnisotropicFriction( colObj1, cp.m_lateralFrictionDir2, btCollisionObject::CF_ANISOTROPIC_FRICTION );
+                    setupFrictionConstraint( *frictionConstraint2, cp.m_lateralFrictionDir2, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, infoGlobal );
+                }
+
+                if ( ( infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS ) && ( infoGlobal.m_solverMode & SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION ) )
+                {
+                    cp.m_contactPointFlags |= BT_CONTACT_FLAG_LATERAL_FRICTION_INITIALIZED;  // lets the cached-direction branch below be taken when caching is enabled
+                }
+            }
+        }
+        else
+        {
+            // cached / user supplied friction directions: also pass the per-contact target motion and friction CFM
+            setupFrictionConstraint( *frictionConstraint1, cp.m_lateralFrictionDir1, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, infoGlobal, cp.m_contactMotion1, cp.m_frictionCFM );
+            if ( frictionConstraint2 )
+            {
+                setupFrictionConstraint( *frictionConstraint2, cp.m_lateralFrictionDir2, solverBodyIdA, solverBodyIdB, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation,
infoGlobal, cp.m_contactMotion2, cp.m_frictionCFM );
+            }
+        }
+    }
+
+    setFrictionConstraintImpulse( contactConstraint, solverBodyIdA, solverBodyIdB, cp, infoGlobal );
+}
+
+// btParallelFor body: runs internalSetupContactConstraints on every contact of the batches [iBegin, iEnd).
+struct SetupContactConstraintsLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btBatchedConstraints* m_bc;
+    const btContactSolverInfo* m_infoGlobal;
+
+    SetupContactConstraintsLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc, const btContactSolverInfo& infoGlobal )
+    {
+        m_solver = solver;
+        m_bc = bc;
+        m_infoGlobal = &infoGlobal;  // keeps the address only; the caller's object must outlive the loop
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "SetupContactConstraintsLoop" );
+        for ( int iBatch = iBegin; iBatch < iEnd; ++iBatch )
+        {
+            const btBatchedConstraints::Range& batch = m_bc->m_batches[ iBatch ];
+            for (int i = batch.begin; i < batch.end; ++i)
+            {
+                int iContact = m_bc->m_constraintIndices[i];  // batch entries are indices into the contact constraint pool
+                m_solver->internalSetupContactConstraints( iContact, *m_infoGlobal );
+            }
+        }
+    }
+};
+
+// Sets up every contact constraint: phase by phase with a parallel loop over batches when batching, otherwise a serial loop.
+void btSequentialImpulseConstraintSolverMt::setupAllContactConstraints(const btContactSolverInfo& infoGlobal)
+{
+    BT_PROFILE( "setupAllContactConstraints" );
+    if ( m_useBatching )
+    {
+        const btBatchedConstraints& batchedCons = m_batchedContactConstraints;
+        SetupContactConstraintsLoop loop( this, &batchedCons, infoGlobal );
+        for ( int iiPhase = 0; iiPhase < batchedCons.m_phases.size(); ++iiPhase )
+        {
+            int iPhase = batchedCons.m_phaseOrder[ iiPhase ];  // phases are visited in m_phaseOrder order
+            const btBatchedConstraints::Range& phase = batchedCons.m_phases[ iPhase ];
+            int grainSize = 1;
+            btParallelFor( phase.begin, phase.end, grainSize, loop );  // the phase's range indexes batches
+        }
+    }
+    else
+    {
+        for ( int i = 0; i < m_tmpSolverContactConstraintPool.size(); ++i )
+        {
+            internalSetupContactConstraints( i, infoGlobal );
+        }
+    }
+}
+
+// Returns the solver body id for a collision object, creating the solver body under a mutex when it does not exist yet.
+int btSequentialImpulseConstraintSolverMt::getOrInitSolverBodyThreadsafe(btCollisionObject& body,btScalar timeStep)
+{
+    //
+    // getOrInitSolverBody is threadsafe only for a single thread per
solver (with potentially multiple solvers)
+    //
+    // getOrInitSolverBodyThreadsafe -- attempts to be fully threadsafe (however may affect determinism)
+    //
+    int solverBodyId = -1;
+    if ( !body.isStaticOrKinematicObject() )
+    {
+        // dynamic body
+        // Dynamic bodies can only be in one island, so it's safe to write to the companionId
+        solverBodyId = body.getCompanionId();
+        if ( solverBodyId < 0 )
+        {
+            m_bodySolverArrayMutex.lock();
+            // now that we have the lock, check again
+            solverBodyId = body.getCompanionId();
+            if ( solverBodyId < 0 )
+            {
+                if ( btRigidBody* rb = btRigidBody::upcast( &body ) )  // NOTE(review): if the upcast fails, -1 is returned as-is -- confirm callers never pass such objects
+                {
+                    solverBodyId = m_tmpSolverBodyPool.size();
+                    btSolverBody& solverBody = m_tmpSolverBodyPool.expand();
+                    initSolverBody( &solverBody, &body, timeStep );
+                    body.setCompanionId( solverBodyId );
+                }
+            }
+            m_bodySolverArrayMutex.unlock();
+        }
+    }
+    else if (body.isKinematicObject())
+    {
+        //
+        // NOTE: must test for kinematic before static because some kinematic objects also
+        //   identify as "static"
+        //
+        // Kinematic bodies can be in multiple islands at once, so it is a
+        // race condition to write to them, so we use an alternate method
+        // to record the solverBodyId
+        int uniqueId = body.getWorldArrayIndex();
+        const int INVALID_SOLVER_BODY_ID = -1;
+        if (m_kinematicBodyUniqueIdToSolverBodyTable.size() <= uniqueId )
+        {
+            m_kinematicBodyUniqueIdToSolverBodyTableMutex.lock();
+            // now that we have the lock, check again
+            if ( m_kinematicBodyUniqueIdToSolverBodyTable.size() <= uniqueId )
+            {
+                m_kinematicBodyUniqueIdToSolverBodyTable.resize( uniqueId + 1, INVALID_SOLVER_BODY_ID );
+            }
+            m_kinematicBodyUniqueIdToSolverBodyTableMutex.unlock();
+        }
+        solverBodyId = m_kinematicBodyUniqueIdToSolverBodyTable[ uniqueId ];  // NOTE(review): read is outside the table mutex while resize() may run on another thread -- looks racy, confirm
+        // if no table entry yet,
+        if ( INVALID_SOLVER_BODY_ID == solverBodyId )
+        {
+            // need to acquire both locks
+            m_kinematicBodyUniqueIdToSolverBodyTableMutex.lock();
+            m_bodySolverArrayMutex.lock();
+            // now that we have the lock, check again
+            solverBodyId = m_kinematicBodyUniqueIdToSolverBodyTable[ uniqueId ];
+            if ( INVALID_SOLVER_BODY_ID == solverBodyId )
+            {
+                // create a table entry for this body
+                // (the collision object itself is handed to initSolverBody; no btRigidBody pointer is needed here)
+                solverBodyId = m_tmpSolverBodyPool.size();
+                btSolverBody& solverBody = m_tmpSolverBodyPool.expand();
+                initSolverBody( &solverBody, &body, timeStep );
+                m_kinematicBodyUniqueIdToSolverBodyTable[ uniqueId ] = solverBodyId;
+            }
+            m_bodySolverArrayMutex.unlock();  // released in reverse order of acquisition
+            m_kinematicBodyUniqueIdToSolverBodyTableMutex.unlock();
+        }
+    }
+    else
+    {
+        // all fixed bodies (inf mass) get mapped to a single solver id
+        if ( m_fixedBodyId < 0 )
+        {
+            m_bodySolverArrayMutex.lock();
+            // now that we have the lock, check again
+            if ( m_fixedBodyId < 0 )
+            {
+                m_fixedBodyId = m_tmpSolverBodyPool.size();
+                btSolverBody& fixedBody = m_tmpSolverBodyPool.expand();
+                initSolverBody( &fixedBody, 0, timeStep );
+            }
+            m_bodySolverArrayMutex.unlock();
+        }
+        solverBodyId = m_fixedBodyId;
+    }
+    btAssert( solverBodyId < m_tmpSolverBodyPool.size() );
+    return solverBodyId;
+}
+
+// Fills cachedInfoArray[0..numManifolds) with the solver body ids and touching contact points of the matching manifolds.
+void btSequentialImpulseConstraintSolverMt::internalCollectContactManifoldCachedInfo(btContactManifoldCachedInfo* cachedInfoArray, btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal)
+{
+    BT_PROFILE("internalCollectContactManifoldCachedInfo");
+    for (int i = 0; i < numManifolds; ++i)
+    {
+        btContactManifoldCachedInfo* cachedInfo = &cachedInfoArray[i];
+        btPersistentManifold* manifold = manifoldPtr[i];
+        btCollisionObject* colObj0 = (btCollisionObject*) manifold->getBody0();
+        btCollisionObject* colObj1 = (btCollisionObject*) manifold->getBody1();
+
+        int solverBodyIdA = getOrInitSolverBodyThreadsafe( *colObj0, infoGlobal.m_timeStep );
+        int solverBodyIdB = getOrInitSolverBodyThreadsafe( *colObj1, infoGlobal.m_timeStep );
+
+        cachedInfo->solverBodyIds[ 0 ] = solverBodyIdA;
+        cachedInfo->solverBodyIds[ 1 ] = solverBodyIdB;
+        cachedInfo->numTouchingContacts = 0;  // default, so a skipped manifold contributes no contacts
+
+        
btSolverBody* solverBodyA = &m_tmpSolverBodyPool[ solverBodyIdA ];
+        btSolverBody* solverBodyB = &m_tmpSolverBodyPool[ solverBodyIdB ];
+
+        ///avoid collision response between two static objects
+        if ( solverBodyA->m_invMass.fuzzyZero() && solverBodyB->m_invMass.fuzzyZero() )
+            continue;  // skip only this manifold; 'break' would leave the rest of the (uninitialized) cachedInfo range unwritten
+
+        int iContact = 0;
+        for ( int j = 0; j < manifold->getNumContacts(); j++ )
+        {
+            btManifoldPoint& cp = manifold->getContactPoint( j );
+
+            if ( cp.getDistance() <= manifold->getContactProcessingThreshold() )
+            {
+                cachedInfo->contactPoints[ iContact ] = &cp;
+                cachedInfo->contactHasRollingFriction[ iContact ] = ( cp.m_combinedRollingFriction > 0.f );
+                iContact++;
+            }
+        }
+        cachedInfo->numTouchingContacts = iContact;  // only points within the processing threshold were recorded
+    }
+}
+
+// btParallelFor body: collects cached manifold info for the manifolds [iBegin, iEnd).
+struct CollectContactManifoldCachedInfoLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    btSequentialImpulseConstraintSolverMt::btContactManifoldCachedInfo* m_cachedInfoArray;
+    btPersistentManifold** m_manifoldPtr;
+    const btContactSolverInfo* m_infoGlobal;
+
+    CollectContactManifoldCachedInfoLoop( btSequentialImpulseConstraintSolverMt* solver, btSequentialImpulseConstraintSolverMt::btContactManifoldCachedInfo* cachedInfoArray, btPersistentManifold** manifoldPtr, const btContactSolverInfo& infoGlobal )
+    {
+        m_solver = solver;
+        m_cachedInfoArray = cachedInfoArray;
+        m_manifoldPtr = manifoldPtr;
+        m_infoGlobal = &infoGlobal;
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        // both arrays are offset by iBegin so the callee can index its sub-range from 0
+        m_solver->internalCollectContactManifoldCachedInfo( m_cachedInfoArray + iBegin, m_manifoldPtr + iBegin, iEnd - iBegin, *m_infoGlobal );
+    }
+};
+
+// Writes body ids, contact point and friction / rolling friction indices into the pre-sized constraint pools for each manifold.
+void btSequentialImpulseConstraintSolverMt::internalAllocContactConstraints(const btContactManifoldCachedInfo* cachedInfoArray, int numManifolds)
+{
+    BT_PROFILE("internalAllocContactConstraints");
+    // possibly parallel part
+    for ( int iManifold = 0; iManifold < numManifolds; ++iManifold )
+    {
+        const btContactManifoldCachedInfo& cachedInfo = cachedInfoArray[ iManifold ];
+        int
contactIndex = cachedInfo.contactIndex;  // first contact row of this manifold (assigned in allocAllContactConstraints)
+        int frictionIndex = contactIndex * m_numFrictionDirections;
+        int rollingFrictionIndex = cachedInfo.rollingFrictionIndex;
+        for ( int i = 0; i < cachedInfo.numTouchingContacts; i++ )
+        {
+            btSolverConstraint& contactConstraint = m_tmpSolverContactConstraintPool[contactIndex];
+            contactConstraint.m_solverBodyIdA = cachedInfo.solverBodyIds[ 0 ];
+            contactConstraint.m_solverBodyIdB = cachedInfo.solverBodyIds[ 1 ];
+            contactConstraint.m_originalContactPoint = cachedInfo.contactPoints[ i ];
+
+            // allocate the friction constraints
+            contactConstraint.m_frictionIndex = frictionIndex;
+            for ( int iDir = 0; iDir < m_numFrictionDirections; ++iDir )
+            {
+                btSolverConstraint& frictionConstraint = m_tmpSolverContactFrictionConstraintPool[frictionIndex];
+                frictionConstraint.m_frictionIndex = contactIndex;  // back-reference to the owning contact row
+                frictionIndex++;
+            }
+
+            // allocate rolling friction constraints
+            if ( cachedInfo.contactHasRollingFriction[ i ] )
+            {
+                m_rollingFrictionIndexTable[ contactIndex ] = rollingFrictionIndex;
+                // allocate 3 (although we may use only 2 sometimes)
+                for ( int iRolling = 0; iRolling < 3; iRolling++ )  // own index name: the outer 'i' is the contact index
+                {
+                    m_tmpSolverContactRollingFrictionConstraintPool[ rollingFrictionIndex ].m_frictionIndex = contactIndex;
+                    rollingFrictionIndex++;
+                }
+            }
+            else
+            {
+                // indicate there is no rolling friction for this contact point
+                m_rollingFrictionIndexTable[ contactIndex ] = -1;
+            }
+            contactIndex++;
+        }
+    }
+}
+
+// btParallelFor body: allocates the contact constraints for the manifolds [iBegin, iEnd).
+struct AllocContactConstraintsLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btSequentialImpulseConstraintSolverMt::btContactManifoldCachedInfo* m_cachedInfoArray;
+
+    AllocContactConstraintsLoop( btSequentialImpulseConstraintSolverMt* solver, btSequentialImpulseConstraintSolverMt::btContactManifoldCachedInfo* cachedInfoArray )
+    {
+        m_solver = solver;
+        m_cachedInfoArray = cachedInfoArray;
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        m_solver->internalAllocContactConstraints(
m_cachedInfoArray + iBegin, iEnd - iBegin );
+    }
+};
+
+// Collects per-manifold info in parallel, sizes the constraint pools serially, then fills the rows' indices in parallel.
+void btSequentialImpulseConstraintSolverMt::allocAllContactConstraints(btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal)
+{
+    BT_PROFILE( "allocAllContactConstraints" );
+    btAlignedObjectArray<btContactManifoldCachedInfo> cachedInfoArray; // = m_manifoldCachedInfoArray;
+    cachedInfoArray.resizeNoInitialize( numManifolds );  // entries are left uninitialized; the collect pass must write every one
+    if (false)  // sequential variant kept for reference, never taken
+    {
+        // sequential
+        internalCollectContactManifoldCachedInfo(&cachedInfoArray[ 0 ], manifoldPtr, numManifolds, infoGlobal);
+    }
+    else
+    {
+        // may alter ordering of bodies which affects determinism
+        CollectContactManifoldCachedInfoLoop loop( this, &cachedInfoArray[ 0 ], manifoldPtr, infoGlobal );
+        int grainSize = 200;
+        btParallelFor( 0, numManifolds, grainSize, loop );
+    }
+
+    {
+        // serial part
+        int numContacts = 0;
+        int numRollingFrictionConstraints = 0;
+        for ( int iManifold = 0; iManifold < numManifolds; ++iManifold )
+        {
+            btContactManifoldCachedInfo& cachedInfo = cachedInfoArray[ iManifold ];
+            cachedInfo.contactIndex = numContacts;  // running totals become each manifold's first row index
+            cachedInfo.rollingFrictionIndex = numRollingFrictionConstraints;
+            numContacts += cachedInfo.numTouchingContacts;
+            for (int i = 0; i < cachedInfo.numTouchingContacts; ++i)
+            {
+                if (cachedInfo.contactHasRollingFriction[i])
+                {
+                    numRollingFrictionConstraints += 3;  // matches the 3 rows reserved per contact in internalAllocContactConstraints
+                }
+            }
+        }
+        m_tmpSolverContactConstraintPool.resizeNoInitialize(numContacts);
+        m_rollingFrictionIndexTable.resizeNoInitialize(numContacts);
+        m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize(numContacts*m_numFrictionDirections);
+        m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize(numRollingFrictionConstraints);
+    }
+    {
+        AllocContactConstraintsLoop loop(this, &cachedInfoArray[0]);
+        int grainSize = 200;
+        btParallelFor( 0, numManifolds, grainSize, loop );
+    }
+}
+
+// Batched replacement for the base class convertContacts; defers to the base implementation when batching is off.
+void btSequentialImpulseConstraintSolverMt::convertContacts(btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal)
+{
+    if
(!m_useBatching)
+    {
+        btSequentialImpulseConstraintSolver::convertContacts(manifoldPtr, numManifolds, infoGlobal);
+        return;
+    }
+    BT_PROFILE( "convertContacts" );
+    if (numManifolds > 0)  // also guards the &cachedInfoArray[0] accesses inside allocAllContactConstraints
+    {
+        if ( m_fixedBodyId < 0 )
+        {
+            m_fixedBodyId = m_tmpSolverBodyPool.size();
+            btSolverBody& fixedBody = m_tmpSolverBodyPool.expand();
+            initSolverBody( &fixedBody, 0, infoGlobal.m_timeStep );
+        }
+        allocAllContactConstraints( manifoldPtr, numManifolds, infoGlobal );
+        if ( m_useBatching )  // always true here because of the early return above
+        {
+            setupBatchedContactConstraints();
+        }
+        setupAllContactConstraints( infoGlobal );
+    }
+}
+
+// For constraints [iBegin, iEnd): builds the jacobian, clears applied impulse and joint feedback, and records the row counts in m_tmpConstraintSizesPool.
+void btSequentialImpulseConstraintSolverMt::internalInitMultipleJoints( btTypedConstraint** constraints, int iBegin, int iEnd )
+{
+    BT_PROFILE("internalInitMultipleJoints");
+    for ( int i = iBegin; i < iEnd; i++ )
+    {
+        btTypedConstraint* constraint = constraints[i];
+        btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+        if (constraint->isEnabled())
+        {
+            constraint->buildJacobian();
+            constraint->internalSetAppliedImpulse( 0.0f );
+            btJointFeedback* fb = constraint->getJointFeedback();
+            if ( fb )
+            {
+                fb->m_appliedForceBodyA.setZero();
+                fb->m_appliedTorqueBodyA.setZero();
+                fb->m_appliedForceBodyB.setZero();
+                fb->m_appliedTorqueBodyB.setZero();
+            }
+            constraint->getInfo1( &info1 );
+        }
+        else
+        {
+            info1.m_numConstraintRows = 0;  // disabled constraints contribute no rows
+            info1.nub = 0;
+        }
+    }
+}
+
+// btParallelFor body: initializes the joints [iBegin, iEnd).
+struct InitJointsLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    btTypedConstraint** m_constraints;
+
+    InitJointsLoop( btSequentialImpulseConstraintSolverMt* solver, btTypedConstraint** constraints )
+    {
+        m_solver = solver;
+        m_constraints = constraints;
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        m_solver->internalInitMultipleJoints( m_constraints, iBegin, iEnd );
+    }
+};
+
+// Converts the joints [iBegin, iEnd) into solver rows at the row offsets recorded in jointParamsArray.
+void btSequentialImpulseConstraintSolverMt::internalConvertMultipleJoints( const btAlignedObjectArray<JointParams>& jointParamsArray, btTypedConstraint** constraints, int iBegin,
int iEnd, const btContactSolverInfo& infoGlobal )
+{
+    BT_PROFILE("internalConvertMultipleJoints");
+    for ( int i = iBegin; i < iEnd; ++i )
+    {
+        const JointParams& jointParams = jointParamsArray[ i ];
+        int currentRow = jointParams.m_solverConstraint;  // -1 marks a joint with no rows
+        if ( currentRow != -1 )
+        {
+            const btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[ i ];
+            btAssert( currentRow < m_tmpSolverNonContactConstraintPool.size() );
+            btAssert( info1.m_numConstraintRows > 0 );
+
+            btSolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[ currentRow ];
+            btTypedConstraint* constraint = constraints[ i ];
+
+            convertJoint( currentConstraintRow, constraint, info1, jointParams.m_solverBodyA, jointParams.m_solverBodyB, infoGlobal );
+        }
+    }
+}
+
+// btParallelFor body: converts the joints [iBegin, iEnd) into solver rows.
+struct ConvertJointsLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btAlignedObjectArray<btSequentialImpulseConstraintSolverMt::JointParams>& m_jointParamsArray;  // reference member: the array must outlive the loop
+    btTypedConstraint** m_srcConstraints;
+    const btContactSolverInfo& m_infoGlobal;
+
+    ConvertJointsLoop( btSequentialImpulseConstraintSolverMt* solver,
+        const btAlignedObjectArray<btSequentialImpulseConstraintSolverMt::JointParams>& jointParamsArray,
+        btTypedConstraint** srcConstraints,
+        const btContactSolverInfo& infoGlobal
+        ) :
+        m_jointParamsArray(jointParamsArray),
+        m_infoGlobal(infoGlobal)
+    {
+        m_solver = solver;
+        m_srcConstraints = srcConstraints;
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        m_solver->internalConvertMultipleJoints( m_jointParamsArray, m_srcConstraints, iBegin, iEnd, m_infoGlobal );
+    }
+};
+
+// Batched replacement for the base class convertJoints; defers to the base implementation when batching is off.
+void btSequentialImpulseConstraintSolverMt::convertJoints(btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& infoGlobal)
+{
+    if ( !m_useBatching )
+    {
+        btSequentialImpulseConstraintSolver::convertJoints(constraints, numConstraints, infoGlobal);
+        return;
+    }
+    BT_PROFILE("convertJoints");
+    bool parallelJointSetup = true;  // the serial paths below are kept as a fallback
+    m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);
+    if (parallelJointSetup)
+    {
+        
InitJointsLoop loop(this, constraints);
+        int grainSize = 40;
+        btParallelFor(0, numConstraints, grainSize, loop);
+    }
+    else
+    {
+        internalInitMultipleJoints( constraints, 0, numConstraints );
+    }
+
+    int totalNumRows = 0;
+    btAlignedObjectArray<JointParams> jointParamsArray;
+    jointParamsArray.resizeNoInitialize(numConstraints);
+
+    //calculate the total number of constraint rows
+    // serial pass: gives each joint its first row index and solver body ids
+    // NOTE(review): the head of this loop was garbled in this copy of the patch and has been reconstructed -- confirm against upstream
+    for (int i=0;i<numConstraints;i++)
+    {
+        btTypedConstraint* constraint = constraints[i];
+        JointParams& params = jointParamsArray[i];
+        const btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+        if (info1.m_numConstraintRows)
+        {
+            params.m_solverConstraint = totalNumRows;  // index of this joint's first row in m_tmpSolverNonContactConstraintPool
+            params.m_solverBodyA = getOrInitSolverBody( constraint->getRigidBodyA(), infoGlobal.m_timeStep );
+            params.m_solverBodyB = getOrInitSolverBody( constraint->getRigidBodyB(), infoGlobal.m_timeStep );
+        }
+        else
+        {
+            params.m_solverConstraint = -1;  // no rows: skipped by internalConvertMultipleJoints
+        }
+        totalNumRows += info1.m_numConstraintRows;
+    }
+    m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows);
+
+    ///setup the btSolverConstraints
+    if ( parallelJointSetup )
+    {
+        ConvertJointsLoop loop(this, jointParamsArray, constraints, infoGlobal);
+        int grainSize = 20;
+        btParallelFor(0, numConstraints, grainSize, loop);
+    }
+    else
+    {
+        internalConvertMultipleJoints( jointParamsArray, constraints, 0, numConstraints, infoGlobal );
+    }
+    setupBatchedJointConstraints();
+}
+
+// Initializes solver bodies [iBegin, iEnd) one-to-one from bodies[] and adds the enabled gyroscopic terms.
+void btSequentialImpulseConstraintSolverMt::internalConvertBodies(btCollisionObject** bodies, int iBegin, int iEnd, const btContactSolverInfo& infoGlobal)
+{
+    BT_PROFILE("internalConvertBodies");
+    for (int i=iBegin; i < iEnd; i++)
+    {
+        btCollisionObject* obj = bodies[i];
+        obj->setCompanionId(i);  // solver body index equals the index in bodies[]
+        btSolverBody& solverBody = m_tmpSolverBodyPool[i];
+        initSolverBody(&solverBody, obj, infoGlobal.m_timeStep);
+
+        btRigidBody* body = btRigidBody::upcast(obj);
+        if (body && body->getInvMass())
+        {
+            btVector3 gyroForce (0,0,0);
+            if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_EXPLICIT)
+            {
+                gyroForce = body->computeGyroscopicForceExplicit(infoGlobal.m_maxGyroscopicForce);
+                solverBody.m_externalTorqueImpulse -= gyroForce*body->getInvInertiaTensorWorld()*infoGlobal.m_timeStep;
+            }
+            if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_WORLD)
+            {
+                gyroForce =
body->computeGyroscopicImpulseImplicit_World(infoGlobal.m_timeStep);
+                solverBody.m_externalTorqueImpulse += gyroForce;
+            }
+            if (body->getFlags()&BT_ENABLE_GYROSCOPIC_FORCE_IMPLICIT_BODY)
+            {
+                gyroForce = body->computeGyroscopicImpulseImplicit_Body(infoGlobal.m_timeStep);
+                solverBody.m_externalTorqueImpulse += gyroForce;
+            }
+        }
+    }
+}
+
+// btParallelFor body: converts the bodies [iBegin, iEnd) into solver bodies.
+struct ConvertBodiesLoop : public btIParallelForBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    btCollisionObject** m_bodies;
+    int m_numBodies;  // stored but not read by forLoop
+    const btContactSolverInfo& m_infoGlobal;
+
+    ConvertBodiesLoop( btSequentialImpulseConstraintSolverMt* solver,
+        btCollisionObject** bodies,
+        int numBodies,
+        const btContactSolverInfo& infoGlobal
+        ) :
+        m_infoGlobal(infoGlobal)
+    {
+        m_solver = solver;
+        m_bodies = bodies;
+        m_numBodies = numBodies;
+    }
+    void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        m_solver->internalConvertBodies( m_bodies, iBegin, iEnd, m_infoGlobal );
+    }
+};
+
+// Sizes the solver body pool to numBodies+1 (last slot = shared fixed body) and initializes all solver bodies.
+void btSequentialImpulseConstraintSolverMt::convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal)
+{
+    BT_PROFILE("convertBodies");
+    m_kinematicBodyUniqueIdToSolverBodyTable.resize( 0 );
+
+    m_tmpSolverBodyPool.resizeNoInitialize(numBodies+1);
+
+    m_fixedBodyId = numBodies;  // the extra slot past the per-body entries
+    {
+        btSolverBody& fixedBody = m_tmpSolverBodyPool[ m_fixedBodyId ];
+        initSolverBody( &fixedBody, NULL, infoGlobal.m_timeStep );
+    }
+
+    bool parallelBodySetup = true;  // the serial path below is kept as a fallback
+    if (parallelBodySetup)
+    {
+        ConvertBodiesLoop loop(this, bodies, numBodies, infoGlobal);
+        int grainSize = 40;
+        btParallelFor(0, numBodies, grainSize, loop);
+    }
+    else
+    {
+        internalConvertBodies( bodies, 0, numBodies, infoGlobal );
+    }
+}
+
+// Decides whether this solve uses batching, then runs the base class setup.
+btScalar btSequentialImpulseConstraintSolverMt::solveGroupCacheFriendlySetup(
+    btCollisionObject** bodies,
+    int numBodies,
+    btPersistentManifold** manifoldPtr,
+    int numManifolds,
+    btTypedConstraint** constraints,
+    int numConstraints,
+    const btContactSolverInfo& infoGlobal,
+    btIDebugDraw*
debugDrawer
+    )
+{
+    m_numFrictionDirections = (infoGlobal.m_solverMode & SOLVER_USE_2_FRICTION_DIRECTIONS) ? 2 : 1;
+    m_useBatching = false;
+    if ( numManifolds >= s_minimumContactManifoldsForBatching &&
+        (s_allowNestedParallelForLoops || !btThreadsAreRunning())
+        )
+    {
+        // enough manifolds, and either nested parallel loops are allowed or no threads are running
+        m_useBatching = true;
+        m_batchedContactConstraints.m_debugDrawer = debugDrawer;
+        m_batchedJointConstraints.m_debugDrawer = debugDrawer;
+    }
+    btSequentialImpulseConstraintSolver::solveGroupCacheFriendlySetup( bodies,
+        numBodies,
+        manifoldPtr,
+        numManifolds,
+        constraints,
+        numConstraints,
+        infoGlobal,
+        debugDrawer
+        );
+    return 0.0f;  // the base class return value is discarded
+}
+
+// Solves the split penetration impulse for the contacts consIndices[batchBegin..batchEnd); returns the sum of squared residuals.
+btScalar btSequentialImpulseConstraintSolverMt::resolveMultipleContactSplitPenetrationImpulseConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd )
+{
+    btScalar leastSquaresResidual = 0.f;
+    for ( int iiCons = batchBegin; iiCons < batchEnd; ++iiCons )
+    {
+        int iCons = consIndices[ iiCons ];  // index into the contact constraint pool
+        const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[ iCons ];
+        btSolverBody& bodyA = m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdA ];
+        btSolverBody& bodyB = m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdB ];
+        btScalar residual = resolveSplitPenetrationImpulse( bodyA, bodyB, solveManifold );
+        leastSquaresResidual += residual*residual;
+    }
+    return leastSquaresResidual;
+}
+
+// btParallelSum body: solves split penetration impulses for the batches [iBegin, iEnd) and returns their summed squared residual.
+struct ContactSplitPenetrationImpulseSolverLoop : public btIParallelSumBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btBatchedConstraints* m_bc;
+
+    ContactSplitPenetrationImpulseSolverLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc )
+    {
+        m_solver = solver;
+        m_bc = bc;
+    }
+    btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "ContactSplitPenetrationImpulseSolverLoop" );
+        btScalar sum = 0;
+        for ( int iBatch = iBegin; iBatch < iEnd; ++iBatch )
+        {
+            const btBatchedConstraints::Range& batch = m_bc->m_batches[ iBatch ];
+            sum +=
m_solver->resolveMultipleContactSplitPenetrationImpulseConstraints( m_bc->m_constraintIndices, batch.begin, batch.end );
+        }
+        return sum;
+    }
+};
+
+
+// Runs up to infoGlobal.m_numIterations split-impulse passes over the contact constraints
+// (only when infoGlobal.m_splitImpulse is set). A pass ends the loop early once its summed
+// squared residual is <= infoGlobal.m_leastSquaresResidualThreshold.
+// With batching, each phase is solved with btParallelSum and the phase sums are added up;
+// without batching, all contact constraints are solved sequentially in m_orderTmpConstraintPool order.
+void btSequentialImpulseConstraintSolverMt::solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
+{
+    BT_PROFILE("solveGroupCacheFriendlySplitImpulseIterations");
+    if (infoGlobal.m_splitImpulse)
+    {
+        for ( int iteration = 0; iteration < infoGlobal.m_numIterations; iteration++ )
+        {
+            btScalar leastSquaresResidual = 0.f;
+            if (m_useBatching)
+            {
+                const btBatchedConstraints& batchedCons = m_batchedContactConstraints;
+                ContactSplitPenetrationImpulseSolverLoop loop( this, &batchedCons );
+                // accumulate into the per-iteration leastSquaresResidual declared above: re-declaring it in this scope hid the batched sum from the early-out test below
+                for ( int iiPhase = 0; iiPhase < batchedCons.m_phases.size(); ++iiPhase )
+                {
+                    int iPhase = batchedCons.m_phaseOrder[ iiPhase ];
+                    const btBatchedConstraints::Range& phase = batchedCons.m_phases[ iPhase ];
+                    int grainSize = 8;
+                    leastSquaresResidual += btParallelSum( phase.begin, phase.end, grainSize, loop );
+                }
+            }
+            else
+            {
+                // non-batched
+                leastSquaresResidual = resolveMultipleContactSplitPenetrationImpulseConstraints(m_orderTmpConstraintPool, 0, m_tmpSolverContactConstraintPool.size());
+            }
+            if ( leastSquaresResidual <= infoGlobal.m_leastSquaresResidualThreshold || iteration >= ( infoGlobal.m_numIterations - 1 ) )
+            {
+#ifdef VERBOSE_RESIDUAL_PRINTF
+                printf( "residual = %f at iteration #%d\n", leastSquaresResidual, iteration );
+#endif
+                break;
+            }
+        }
+    }
+}
+
+
+btScalar btSequentialImpulseConstraintSolverMt::solveSingleIteration(int iteration, btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds, btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
+{
+    if ( !m_useBatching )
+    {
+        // batching disabled for this group: defer entirely to the single-threaded base solver
+        return btSequentialImpulseConstraintSolver::solveSingleIteration( iteration, bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer );
+    }
+    BT_PROFILE( "solveSingleIterationMt" );
+    btScalar leastSquaresResidual = 0.f;
+
+    if (infoGlobal.m_solverMode & SOLVER_RANDMIZE_ORDER)
+    {
+        if (1) // uncomment this for a bit less random ((iteration & 7) == 0)
+        {
+            randomizeConstraintOrdering(iteration, infoGlobal.m_numIterations);
+        }
+    }
+
+    {
+        ///solve all joint constraints
+        leastSquaresResidual += resolveAllJointConstraints(iteration);
+
+        if (iteration< infoGlobal.m_numIterations)
+        {
+            // this loop is only used for cone-twist constraints,
+            // it would be nice to skip this loop if none of the constraints need it
+            if ( m_useObsoleteJointConstraints )
+            {
+                for ( int j = 0; j<numConstraints; j++ )
+                {
+                    if ( constraints[ j ]->isEnabled() )
+                    {
+                        int bodyAid = getOrInitSolverBody( constraints[ j ]->getRigidBodyA(), infoGlobal.m_timeStep );
+                        int bodyBid = getOrInitSolverBody( constraints[ j ]->getRigidBodyB(), infoGlobal.m_timeStep );
+                        btSolverBody& bodyA = m_tmpSolverBodyPool[ bodyAid ];
+                        btSolverBody& bodyB = m_tmpSolverBodyPool[ bodyBid ];
+                        constraints[ j ]->solveConstraintObsolete( bodyA, bodyB, infoGlobal.m_timeStep );
+                    }
+                }
+            }
+
+            if (infoGlobal.m_solverMode & SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS)
+            {
+                // solve all contact, contact-friction, and rolling friction constraints interleaved
+                leastSquaresResidual += resolveAllContactConstraintsInterleaved();
+            }
+            else//SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS
+            {
+                // don't interleave them
+                // solve all contact constraints
+                leastSquaresResidual += resolveAllContactConstraints();
+
+                // solve all contact friction constraints
+                leastSquaresResidual += resolveAllContactFrictionConstraints();
+
+                // solve all rolling friction constraints
+                leastSquaresResidual += resolveAllRollingFrictionConstraints();
+            }
+        }
+    }
+    return leastSquaresResidual;
+}
+
+
+btScalar 
btSequentialImpulseConstraintSolverMt::resolveMultipleJointConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd, int iteration )
+{
+    // solves the joint rows indexed by consIndices[ batchBegin .. batchEnd ) and returns the sum of squared residuals
+    btScalar leastSquaresResidual = 0.f;
+    for ( int iiCons = batchBegin; iiCons < batchEnd; ++iiCons )
+    {
+        int iCons = consIndices[ iiCons ];
+        const btSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[ iCons ];
+        // a row is skipped once the iteration count reaches its own m_overrideNumSolverIterations
+        if ( iteration < constraint.m_overrideNumSolverIterations )
+        {
+            btSolverBody& bodyA = m_tmpSolverBodyPool[ constraint.m_solverBodyIdA ];
+            btSolverBody& bodyB = m_tmpSolverBodyPool[ constraint.m_solverBodyIdB ];
+            btScalar residual = resolveSingleConstraintRowGeneric( bodyA, bodyB, constraint );
+            leastSquaresResidual += residual*residual;
+        }
+    }
+    return leastSquaresResidual;
+}
+
+
+btScalar btSequentialImpulseConstraintSolverMt::resolveMultipleContactConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd )
+{
+    // solves the contact (penetration) rows indexed by consIndices[ batchBegin .. batchEnd ) and returns the sum of squared residuals
+    btScalar leastSquaresResidual = 0.f;
+    for ( int iiCons = batchBegin; iiCons < batchEnd; ++iiCons )
+    {
+        int iCons = consIndices[ iiCons ];
+        const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[ iCons ];
+        btSolverBody& bodyA = m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdA ];
+        btSolverBody& bodyB = m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdB ];
+        btScalar residual = resolveSingleConstraintRowLowerLimit( bodyA, bodyB, solveManifold );
+        leastSquaresResidual += residual*residual;
+    }
+    return leastSquaresResidual;
+}
+
+
+btScalar btSequentialImpulseConstraintSolverMt::resolveMultipleContactFrictionConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd )
+{
+    btScalar leastSquaresResidual = 0.f;
+    for ( int iiCons = batchBegin; iiCons < batchEnd; ++iiCons )
+    {
+        int iContact = consIndices[ iiCons ];
+        btScalar totalImpulse = m_tmpSolverContactConstraintPool[ iContact ].m_appliedImpulse;
+
+        // apply sliding friction
+        if ( totalImpulse > 0.0f )
+        {
+            int iBegin = iContact * 
m_numFrictionDirections;
+            int iEnd = iBegin + m_numFrictionDirections;
+            for ( int iFriction = iBegin; iFriction < iEnd; ++iFriction )
+            {
+                btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[ iFriction ]; // index only: the for-statement advances iFriction, a second increment here skipped the 2nd friction direction
+                btAssert( solveManifold.m_frictionIndex == iContact );
+
+                // friction limits scale with the normal impulse applied at this contact
+                solveManifold.m_lowerLimit = -( solveManifold.m_friction*totalImpulse );
+                solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+
+                btSolverBody& bodyA = m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdA ];
+                btSolverBody& bodyB = m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdB ];
+                btScalar residual = resolveSingleConstraintRowGeneric( bodyA, bodyB, solveManifold );
+                leastSquaresResidual += residual*residual;
+            }
+        }
+    }
+    return leastSquaresResidual;
+}
+
+
+btScalar btSequentialImpulseConstraintSolverMt::resolveMultipleContactRollingFrictionConstraints( const btAlignedObjectArray<int>& consIndices, int batchBegin, int batchEnd )
+{
+    btScalar leastSquaresResidual = 0.f;
+    for ( int iiCons = batchBegin; iiCons < batchEnd; ++iiCons )
+    {
+        int iContact = consIndices[ iiCons ];
+        // a negative table entry means this contact has no rolling friction rows
+        int iFirstRollingFriction = m_rollingFrictionIndexTable[ iContact ];
+        if ( iFirstRollingFriction >= 0 )
+        {
+            btScalar totalImpulse = m_tmpSolverContactConstraintPool[ iContact ].m_appliedImpulse;
+            // apply rolling friction
+            if ( totalImpulse > 0.0f )
+            {
+                int iBegin = iFirstRollingFriction;
+                int iEnd = iBegin + 3;
+                for ( int iRollingFric = iBegin; iRollingFric < iEnd; ++iRollingFric )
+                {
+                    btSolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[ iRollingFric ];
+                    if ( rollingFrictionConstraint.m_frictionIndex != iContact )
+                    {
+                        break;
+                    }
+                    btScalar rollingFrictionMagnitude = rollingFrictionConstraint.m_friction*totalImpulse;
+                    if ( rollingFrictionMagnitude > rollingFrictionConstraint.m_friction )
+                    {
+                        rollingFrictionMagnitude = rollingFrictionConstraint.m_friction;
+                    }
+
+                    rollingFrictionConstraint.m_lowerLimit = 
-rollingFrictionMagnitude;
+                    rollingFrictionConstraint.m_upperLimit = rollingFrictionMagnitude;
+
+                    btScalar residual = resolveSingleConstraintRowGeneric( m_tmpSolverBodyPool[ rollingFrictionConstraint.m_solverBodyIdA ], m_tmpSolverBodyPool[ rollingFrictionConstraint.m_solverBodyIdB ], rollingFrictionConstraint );
+                    leastSquaresResidual += residual*residual;
+                }
+            }
+        }
+    }
+    return leastSquaresResidual;
+}
+
+
+// For each contact indexed by contactIndices[ batchBegin .. batchEnd ): solves the penetration row,
+// then (only if its applied impulse is positive) the sliding friction rows and any rolling friction rows.
+// Returns the sum of squared residuals of every row solved.
+btScalar btSequentialImpulseConstraintSolverMt::resolveMultipleContactConstraintsInterleaved( const btAlignedObjectArray<int>& contactIndices,
+                                                                                              int batchBegin,
+                                                                                              int batchEnd
+                                                                                              )
+{
+    btScalar leastSquaresResidual = 0.f;
+
+    for ( int iiCons = batchBegin; iiCons < batchEnd; iiCons++ )
+    {
+        btScalar totalImpulse = 0;
+        int iContact = contactIndices[ iiCons ];
+        // apply penetration constraint
+        {
+            const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[ iContact ];
+            btScalar residual = resolveSingleConstraintRowLowerLimit( m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdA ], m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdB ], solveManifold );
+            leastSquaresResidual += residual*residual;
+            totalImpulse = solveManifold.m_appliedImpulse;
+        }
+
+        // apply sliding friction
+        if ( totalImpulse > 0.0f )
+        {
+            int iBegin = iContact * m_numFrictionDirections;
+            int iEnd = iBegin + m_numFrictionDirections;
+            for ( int iFriction = iBegin; iFriction < iEnd; ++iFriction )
+            {
+                btSolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[ iFriction ];
+                btAssert( solveManifold.m_frictionIndex == iContact );
+
+                solveManifold.m_lowerLimit = -( solveManifold.m_friction*totalImpulse );
+                solveManifold.m_upperLimit = solveManifold.m_friction*totalImpulse;
+
+                btSolverBody& bodyA = m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdA ];
+                btSolverBody& bodyB = m_tmpSolverBodyPool[ solveManifold.m_solverBodyIdB ];
+                btScalar residual = resolveSingleConstraintRowGeneric( 
bodyA, bodyB, solveManifold ); + leastSquaresResidual += residual*residual; + } + } + + // apply rolling friction + int iFirstRollingFriction = m_rollingFrictionIndexTable[ iContact ]; + if ( totalImpulse > 0.0f && iFirstRollingFriction >= 0) + { + int iBegin = iFirstRollingFriction; + int iEnd = iBegin + 3; + for ( int iRollingFric = iBegin; iRollingFric < iEnd; ++iRollingFric ) + { + btSolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[ iRollingFric ]; + if ( rollingFrictionConstraint.m_frictionIndex != iContact ) + { + break; + } + btScalar rollingFrictionMagnitude = rollingFrictionConstraint.m_friction*totalImpulse; + if ( rollingFrictionMagnitude > rollingFrictionConstraint.m_friction ) + { + rollingFrictionMagnitude = rollingFrictionConstraint.m_friction; + } + + rollingFrictionConstraint.m_lowerLimit = -rollingFrictionMagnitude; + rollingFrictionConstraint.m_upperLimit = rollingFrictionMagnitude; + + btScalar residual = resolveSingleConstraintRowGeneric( m_tmpSolverBodyPool[ rollingFrictionConstraint.m_solverBodyIdA ], m_tmpSolverBodyPool[ rollingFrictionConstraint.m_solverBodyIdB ], rollingFrictionConstraint ); + leastSquaresResidual += residual*residual; + } + } + } + return leastSquaresResidual; +} + + +void btSequentialImpulseConstraintSolverMt::randomizeBatchedConstraintOrdering( btBatchedConstraints* batchedConstraints ) +{ + btBatchedConstraints& bc = *batchedConstraints; + // randomize ordering of phases + for ( int ii = 1; ii < bc.m_phaseOrder.size(); ++ii ) + { + int iSwap = btRandInt2( ii + 1 ); + bc.m_phaseOrder.swap( ii, iSwap ); + } + + // for each batch, + for ( int iBatch = 0; iBatch < bc.m_batches.size(); ++iBatch ) + { + // randomize ordering of constraints within the batch + const btBatchedConstraints::Range& batch = bc.m_batches[ iBatch ]; + for ( int iiCons = batch.begin; iiCons < batch.end; ++iiCons ) + { + int iSwap = batch.begin + btRandInt2( iiCons - batch.begin + 1 ); + btAssert(iSwap >= 
batch.begin && iSwap < batch.end); + bc.m_constraintIndices.swap( iiCons, iSwap ); + } + } +} + + +void btSequentialImpulseConstraintSolverMt::randomizeConstraintOrdering(int iteration, int numIterations) +{ + // randomize ordering of joint constraints + randomizeBatchedConstraintOrdering( &m_batchedJointConstraints ); + + //contact/friction constraints are not solved more than numIterations + if ( iteration < numIterations ) + { + randomizeBatchedConstraintOrdering( &m_batchedContactConstraints ); + } +} + + +struct JointSolverLoop : public btIParallelSumBody +{ + btSequentialImpulseConstraintSolverMt* m_solver; + const btBatchedConstraints* m_bc; + int m_iteration; + + JointSolverLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc, int iteration ) + { + m_solver = solver; + m_bc = bc; + m_iteration = iteration; + } + btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + BT_PROFILE( "JointSolverLoop" ); + btScalar sum = 0; + for ( int iBatch = iBegin; iBatch < iEnd; ++iBatch ) + { + const btBatchedConstraints::Range& batch = m_bc->m_batches[ iBatch ]; + sum += m_solver->resolveMultipleJointConstraints( m_bc->m_constraintIndices, batch.begin, batch.end, m_iteration ); + } + return sum; + } +}; + + +btScalar btSequentialImpulseConstraintSolverMt::resolveAllJointConstraints(int iteration) +{ + BT_PROFILE( "resolveAllJointConstraints" ); + const btBatchedConstraints& batchedCons = m_batchedJointConstraints; + JointSolverLoop loop( this, &batchedCons, iteration ); + btScalar leastSquaresResidual = 0.f; + for ( int iiPhase = 0; iiPhase < batchedCons.m_phases.size(); ++iiPhase ) + { + int iPhase = batchedCons.m_phaseOrder[ iiPhase ]; + const btBatchedConstraints::Range& phase = batchedCons.m_phases[ iPhase ]; + int grainSize = 1; + leastSquaresResidual += btParallelSum( phase.begin, phase.end, grainSize, loop ); + } + return leastSquaresResidual; +} + + +struct ContactSolverLoop : public btIParallelSumBody +{ + 
btSequentialImpulseConstraintSolverMt* m_solver; + const btBatchedConstraints* m_bc; + + ContactSolverLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc ) + { + m_solver = solver; + m_bc = bc; + } + btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + BT_PROFILE( "ContactSolverLoop" ); + btScalar sum = 0; + for ( int iBatch = iBegin; iBatch < iEnd; ++iBatch ) + { + const btBatchedConstraints::Range& batch = m_bc->m_batches[ iBatch ]; + sum += m_solver->resolveMultipleContactConstraints( m_bc->m_constraintIndices, batch.begin, batch.end ); + } + return sum; + } +}; + + +btScalar btSequentialImpulseConstraintSolverMt::resolveAllContactConstraints() +{ + BT_PROFILE( "resolveAllContactConstraints" ); + const btBatchedConstraints& batchedCons = m_batchedContactConstraints; + ContactSolverLoop loop( this, &batchedCons ); + btScalar leastSquaresResidual = 0.f; + for ( int iiPhase = 0; iiPhase < batchedCons.m_phases.size(); ++iiPhase ) + { + int iPhase = batchedCons.m_phaseOrder[ iiPhase ]; + const btBatchedConstraints::Range& phase = batchedCons.m_phases[ iPhase ]; + int grainSize = batchedCons.m_phaseGrainSize[iPhase]; + leastSquaresResidual += btParallelSum( phase.begin, phase.end, grainSize, loop ); + } + return leastSquaresResidual; +} + + +struct ContactFrictionSolverLoop : public btIParallelSumBody +{ + btSequentialImpulseConstraintSolverMt* m_solver; + const btBatchedConstraints* m_bc; + + ContactFrictionSolverLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc ) + { + m_solver = solver; + m_bc = bc; + } + btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + BT_PROFILE( "ContactFrictionSolverLoop" ); + btScalar sum = 0; + for ( int iBatch = iBegin; iBatch < iEnd; ++iBatch ) + { + const btBatchedConstraints::Range& batch = m_bc->m_batches[ iBatch ]; + sum += m_solver->resolveMultipleContactFrictionConstraints( m_bc->m_constraintIndices, batch.begin, batch.end ); + } + return 
sum; + } +}; + + +btScalar btSequentialImpulseConstraintSolverMt::resolveAllContactFrictionConstraints() +{ + BT_PROFILE( "resolveAllContactFrictionConstraints" ); + const btBatchedConstraints& batchedCons = m_batchedContactConstraints; + ContactFrictionSolverLoop loop( this, &batchedCons ); + btScalar leastSquaresResidual = 0.f; + for ( int iiPhase = 0; iiPhase < batchedCons.m_phases.size(); ++iiPhase ) + { + int iPhase = batchedCons.m_phaseOrder[ iiPhase ]; + const btBatchedConstraints::Range& phase = batchedCons.m_phases[ iPhase ]; + int grainSize = batchedCons.m_phaseGrainSize[iPhase]; + leastSquaresResidual += btParallelSum( phase.begin, phase.end, grainSize, loop ); + } + return leastSquaresResidual; +} + + +struct InterleavedContactSolverLoop : public btIParallelSumBody +{ + btSequentialImpulseConstraintSolverMt* m_solver; + const btBatchedConstraints* m_bc; + + InterleavedContactSolverLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc ) + { + m_solver = solver; + m_bc = bc; + } + btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + BT_PROFILE( "InterleavedContactSolverLoop" ); + btScalar sum = 0; + for ( int iBatch = iBegin; iBatch < iEnd; ++iBatch ) + { + const btBatchedConstraints::Range& batch = m_bc->m_batches[ iBatch ]; + sum += m_solver->resolveMultipleContactConstraintsInterleaved( m_bc->m_constraintIndices, batch.begin, batch.end ); + } + return sum; + } +}; + + +btScalar btSequentialImpulseConstraintSolverMt::resolveAllContactConstraintsInterleaved() +{ + BT_PROFILE( "resolveAllContactConstraintsInterleaved" ); + const btBatchedConstraints& batchedCons = m_batchedContactConstraints; + InterleavedContactSolverLoop loop( this, &batchedCons ); + btScalar leastSquaresResidual = 0.f; + for ( int iiPhase = 0; iiPhase < batchedCons.m_phases.size(); ++iiPhase ) + { + int iPhase = batchedCons.m_phaseOrder[ iiPhase ]; + const btBatchedConstraints::Range& phase = batchedCons.m_phases[ iPhase ]; + int grainSize = 1; 
+        leastSquaresResidual += btParallelSum( phase.begin, phase.end, grainSize, loop );
+    }
+    return leastSquaresResidual;
+}
+
+
+// Parallel-sum body: each index in [iBegin, iEnd) is a batch of contact constraints whose
+// rolling friction rows (if any) are solved.
+struct ContactRollingFrictionSolverLoop : public btIParallelSumBody
+{
+    btSequentialImpulseConstraintSolverMt* m_solver;
+    const btBatchedConstraints* m_bc;
+
+    ContactRollingFrictionSolverLoop( btSequentialImpulseConstraintSolverMt* solver, const btBatchedConstraints* bc )
+    {
+        m_solver = solver;
+        m_bc = bc;
+    }
+    btScalar sumLoop( int iBegin, int iEnd ) const BT_OVERRIDE
+    {
+        BT_PROFILE( "ContactRollingFrictionSolverLoop" ); // own label, so profiles don't merge this with ContactFrictionSolverLoop
+        btScalar sum = 0;
+        for ( int iBatch = iBegin; iBatch < iEnd; ++iBatch )
+        {
+            const btBatchedConstraints::Range& batch = m_bc->m_batches[ iBatch ];
+            sum += m_solver->resolveMultipleContactRollingFrictionConstraints( m_bc->m_constraintIndices, batch.begin, batch.end );
+        }
+        return sum;
+    }
+};
+
+
+btScalar btSequentialImpulseConstraintSolverMt::resolveAllRollingFrictionConstraints()
+{
+    BT_PROFILE( "resolveAllRollingFrictionConstraints" );
+    btScalar leastSquaresResidual = 0.f;
+    //
+    // We do not generate batches for rolling friction constraints. We assume that
+    // one of two cases is true:
+    //
+    //  1. either most bodies in the simulation have rolling friction, in which case we can use the
+    //     batches for contacts and use a lookup table to translate contact indices to rolling friction
+    //     (ignoring any contact indices that don't map to a rolling friction constraint). As long as
+    //     most contacts have a corresponding rolling friction constraint, this should parallelize well.
+    //
+    //  -OR-
+    //
+    //  2. few bodies in the simulation have rolling friction, so it is not worth trying to use the
+    //     batches from contacts as most of the contacts won't have corresponding rolling friction
+    //     constraints and most threads would end up doing very little work. Most of the time would
+    //     go to threading overhead, so we don't bother with threading.
+    //
+    int numRollingFrictionPoolConstraints = m_tmpSolverContactRollingFrictionConstraintPool.size();
+    if (numRollingFrictionPoolConstraints >= m_tmpSolverContactConstraintPool.size())
+    {
+        // use batching if there are many rolling friction constraints
+        const btBatchedConstraints& batchedCons = m_batchedContactConstraints;
+        ContactRollingFrictionSolverLoop loop( this, &batchedCons );
+        // accumulate into the function-level leastSquaresResidual: re-declaring it in this scope made the batched path always return 0
+        for ( int iiPhase = 0; iiPhase < batchedCons.m_phases.size(); ++iiPhase )
+        {
+            int iPhase = batchedCons.m_phaseOrder[ iiPhase ];
+            const btBatchedConstraints::Range& phase = batchedCons.m_phases[ iPhase ];
+            int grainSize = 1;
+            leastSquaresResidual += btParallelSum( phase.begin, phase.end, grainSize, loop );
+        }
+    }
+    else
+    {
+        // no batching, also ignores SOLVER_RANDMIZE_ORDER
+        for ( int j = 0; j < numRollingFrictionPoolConstraints; j++ )
+        {
+            btSolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[ j ];
+            if ( rollingFrictionConstraint.m_frictionIndex >= 0 )
+            {
+                btScalar totalImpulse = m_tmpSolverContactConstraintPool[ rollingFrictionConstraint.m_frictionIndex ].m_appliedImpulse;
+                if ( totalImpulse > 0.0f )
+                {
+                    // limit is friction*impulse, clamped so it never exceeds the friction coefficient itself
+                    btScalar rollingFrictionMagnitude = rollingFrictionConstraint.m_friction*totalImpulse;
+                    if ( rollingFrictionMagnitude > rollingFrictionConstraint.m_friction )
+                        rollingFrictionMagnitude = rollingFrictionConstraint.m_friction;
+
+                    rollingFrictionConstraint.m_lowerLimit = -rollingFrictionMagnitude;
+                    rollingFrictionConstraint.m_upperLimit = rollingFrictionMagnitude;
+
+                    btScalar residual = resolveSingleConstraintRowGeneric( m_tmpSolverBodyPool[ rollingFrictionConstraint.m_solverBodyIdA ], m_tmpSolverBodyPool[ rollingFrictionConstraint.m_solverBodyIdB ], rollingFrictionConstraint );
+                    leastSquaresResidual += residual*residual;
+                }
+            }
+        }
+    }
+    return leastSquaresResidual;
+}
+
+
+void btSequentialImpulseConstraintSolverMt::internalWriteBackContacts( int iBegin, int iEnd, const 
btContactSolverInfo& infoGlobal ) +{ + BT_PROFILE("internalWriteBackContacts"); + writeBackContacts(iBegin, iEnd, infoGlobal); + //for ( int iContact = iBegin; iContact < iEnd; ++iContact) + //{ + // const btSolverConstraint& contactConstraint = m_tmpSolverContactConstraintPool[ iContact ]; + // btManifoldPoint* pt = (btManifoldPoint*) contactConstraint.m_originalContactPoint; + // btAssert( pt ); + // pt->m_appliedImpulse = contactConstraint.m_appliedImpulse; + // pt->m_appliedImpulseLateral1 = m_tmpSolverContactFrictionConstraintPool[ contactConstraint.m_frictionIndex ].m_appliedImpulse; + // if ( m_numFrictionDirections == 2 ) + // { + // pt->m_appliedImpulseLateral2 = m_tmpSolverContactFrictionConstraintPool[ contactConstraint.m_frictionIndex + 1 ].m_appliedImpulse; + // } + //} +} + + +void btSequentialImpulseConstraintSolverMt::internalWriteBackJoints( int iBegin, int iEnd, const btContactSolverInfo& infoGlobal ) +{ + BT_PROFILE("internalWriteBackJoints"); + writeBackJoints(iBegin, iEnd, infoGlobal); +} + + +void btSequentialImpulseConstraintSolverMt::internalWriteBackBodies( int iBegin, int iEnd, const btContactSolverInfo& infoGlobal ) +{ + BT_PROFILE("internalWriteBackBodies"); + writeBackBodies( iBegin, iEnd, infoGlobal ); +} + + +struct WriteContactPointsLoop : public btIParallelForBody +{ + btSequentialImpulseConstraintSolverMt* m_solver; + const btContactSolverInfo* m_infoGlobal; + + WriteContactPointsLoop( btSequentialImpulseConstraintSolverMt* solver, const btContactSolverInfo& infoGlobal ) + { + m_solver = solver; + m_infoGlobal = &infoGlobal; + } + void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + m_solver->internalWriteBackContacts( iBegin, iEnd, *m_infoGlobal ); + } +}; + + +struct WriteJointsLoop : public btIParallelForBody +{ + btSequentialImpulseConstraintSolverMt* m_solver; + const btContactSolverInfo* m_infoGlobal; + + WriteJointsLoop( btSequentialImpulseConstraintSolverMt* solver, const btContactSolverInfo& infoGlobal ) + { + 
m_solver = solver; + m_infoGlobal = &infoGlobal; + } + void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + m_solver->internalWriteBackJoints( iBegin, iEnd, *m_infoGlobal ); + } +}; + + +struct WriteBodiesLoop : public btIParallelForBody +{ + btSequentialImpulseConstraintSolverMt* m_solver; + const btContactSolverInfo* m_infoGlobal; + + WriteBodiesLoop( btSequentialImpulseConstraintSolverMt* solver, const btContactSolverInfo& infoGlobal ) + { + m_solver = solver; + m_infoGlobal = &infoGlobal; + } + void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE + { + m_solver->internalWriteBackBodies( iBegin, iEnd, *m_infoGlobal ); + } +}; + + +btScalar btSequentialImpulseConstraintSolverMt::solveGroupCacheFriendlyFinish(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal) +{ + BT_PROFILE("solveGroupCacheFriendlyFinish"); + + if (infoGlobal.m_solverMode & SOLVER_USE_WARMSTARTING) + { + WriteContactPointsLoop loop( this, infoGlobal ); + int grainSize = 500; + btParallelFor( 0, m_tmpSolverContactConstraintPool.size(), grainSize, loop ); + } + + { + WriteJointsLoop loop( this, infoGlobal ); + int grainSize = 400; + btParallelFor( 0, m_tmpSolverNonContactConstraintPool.size(), grainSize, loop ); + } + { + WriteBodiesLoop loop( this, infoGlobal ); + int grainSize = 100; + btParallelFor( 0, m_tmpSolverBodyPool.size(), grainSize, loop ); + } + + m_tmpSolverContactConstraintPool.resizeNoInitialize(0); + m_tmpSolverNonContactConstraintPool.resizeNoInitialize(0); + m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize(0); + m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize(0); + + m_tmpSolverBodyPool.resizeNoInitialize(0); + return 0.f; +} + diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h new file mode 100644 index 000000000..0577d8d2d --- /dev/null +++ 
b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h @@ -0,0 +1,154 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/ + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef BT_SEQUENTIAL_IMPULSE_CONSTRAINT_SOLVER_MT_H +#define BT_SEQUENTIAL_IMPULSE_CONSTRAINT_SOLVER_MT_H + +#include "btSequentialImpulseConstraintSolver.h" +#include "btBatchedConstraints.h" +#include "LinearMath/btThreads.h" + +/// +/// btSequentialImpulseConstraintSolverMt +/// +/// A multithreaded variant of the sequential impulse constraint solver. The constraints to be solved are grouped into +/// batches and phases where each batch of constraints within a given phase can be solved in parallel with the rest. +/// Ideally we want as few phases as possible, and each phase should have many batches, and all of the batches should +/// have about the same number of constraints. +/// This method works best on a large island of many constraints. 
+/// +/// Supports all of the features of the normal sequential impulse solver such as: +/// - split penetration impulse +/// - rolling friction +/// - interleaving constraints +/// - warmstarting +/// - 2 friction directions +/// - randomized constraint ordering +/// - early termination when leastSquaresResidualThreshold is satisfied +/// +/// When the SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS flag is enabled, unlike the normal SequentialImpulse solver, +/// the rolling friction is interleaved as well. +/// Interleaving the contact penetration constraints with friction reduces the number of parallel loops that need to be done, +/// which reduces threading overhead so it can be a performance win, however, it does seem to produce a less stable simulation, +/// at least on stacks of blocks. +/// +/// When the SOLVER_RANDMIZE_ORDER flag is enabled, the ordering of phases, and the ordering of constraints within each batch +/// is randomized, however it does not swap constraints between batches. +/// This is to avoid regenerating the batches for each solver iteration which would be quite costly in performance. +/// +/// Note that a non-zero leastSquaresResidualThreshold could possibly affect the determinism of the simulation +/// if the task scheduler's parallelSum operation is non-deterministic. The parallelSum operation can be non-deterministic +/// because floating point addition is not associative due to rounding errors. +/// The task scheduler can and should ensure that the result of any parallelSum operation is deterministic. 
+///
+ATTRIBUTE_ALIGNED16(class) btSequentialImpulseConstraintSolverMt : public btSequentialImpulseConstraintSolver
+{
+public:
+    virtual void solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE;
+    virtual btScalar solveSingleIteration(int iteration, btCollisionObject** bodies ,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE;
+    virtual btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer) BT_OVERRIDE;
+    virtual btScalar solveGroupCacheFriendlyFinish(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
+
+    // temp struct used to collect info from persistent manifolds into a cache-friendly struct using multiple threads
+    struct btContactManifoldCachedInfo
+    {
+        static const int MAX_NUM_CONTACT_POINTS = 4;
+
+        int numTouchingContacts;
+        int solverBodyIds[ 2 ];
+        int contactIndex;
+        int rollingFrictionIndex;
+        bool contactHasRollingFriction[ MAX_NUM_CONTACT_POINTS ];
+        btManifoldPoint* contactPoints[ MAX_NUM_CONTACT_POINTS ];
+    };
+    // temp struct used for setting up joint constraints in parallel
+    struct JointParams
+    {
+        int m_solverConstraint;
+        int m_solverBodyA;
+        int m_solverBodyB;
+    };
+    void internalInitMultipleJoints(btTypedConstraint** constraints, int iBegin, int iEnd);
+    void internalConvertMultipleJoints( const btAlignedObjectArray<JointParams>& jointParamsArray, btTypedConstraint** constraints, int iBegin, int iEnd, const btContactSolverInfo& infoGlobal );
+
+    // parameters to 
control batching
+    static bool s_allowNestedParallelForLoops;  // whether to allow nested parallel operations
+    static int s_minimumContactManifoldsForBatching;  // don't even try to batch if fewer manifolds than this
+    static btBatchedConstraints::BatchingMethod s_contactBatchingMethod;
+    static btBatchedConstraints::BatchingMethod s_jointBatchingMethod;
+    static int s_minBatchSize;  // desired number of constraints per batch
+    static int s_maxBatchSize;
+
+protected:
+    static const int CACHE_LINE_SIZE = 64;
+
+    btBatchedConstraints m_batchedContactConstraints;
+    btBatchedConstraints m_batchedJointConstraints;
+    int m_numFrictionDirections;
+    bool m_useBatching;
+    bool m_useObsoleteJointConstraints;
+    btAlignedObjectArray<btContactManifoldCachedInfo> m_manifoldCachedInfoArray;
+    btAlignedObjectArray<int> m_rollingFrictionIndexTable;  // lookup table mapping contact index to rolling friction index
+    btSpinMutex m_bodySolverArrayMutex;
+    char m_antiFalseSharingPadding[CACHE_LINE_SIZE]; // padding to keep mutexes in separate cachelines
+    btSpinMutex m_kinematicBodyUniqueIdToSolverBodyTableMutex;
+    btAlignedObjectArray<char> m_scratchMemory;  // NOTE(review): element type assumed to be char (raw byte scratch buffer) - confirm against btBatchedConstraints
+
+    virtual void randomizeConstraintOrdering( int iteration, int numIterations );
+    virtual btScalar resolveAllJointConstraints( int iteration );
+    virtual btScalar resolveAllContactConstraints();
+    virtual btScalar resolveAllContactFrictionConstraints();
+    virtual btScalar resolveAllContactConstraintsInterleaved();
+    virtual btScalar resolveAllRollingFrictionConstraints();
+
+    virtual void setupBatchedContactConstraints();
+    virtual void setupBatchedJointConstraints();
+    virtual void convertJoints(btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
+    virtual void convertContacts(btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
+    virtual void convertBodies(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal) BT_OVERRIDE;
+
+    int 
getOrInitSolverBodyThreadsafe(btCollisionObject& body, btScalar timeStep); + void allocAllContactConstraints(btPersistentManifold** manifoldPtr, int numManifolds, const btContactSolverInfo& infoGlobal); + void setupAllContactConstraints(const btContactSolverInfo& infoGlobal); + void randomizeBatchedConstraintOrdering( btBatchedConstraints* batchedConstraints ); + +public: + + BT_DECLARE_ALIGNED_ALLOCATOR(); + + btSequentialImpulseConstraintSolverMt(); + virtual ~btSequentialImpulseConstraintSolverMt(); + + btScalar resolveMultipleJointConstraints( const btAlignedObjectArray& consIndices, int batchBegin, int batchEnd, int iteration ); + btScalar resolveMultipleContactConstraints( const btAlignedObjectArray& consIndices, int batchBegin, int batchEnd ); + btScalar resolveMultipleContactSplitPenetrationImpulseConstraints( const btAlignedObjectArray& consIndices, int batchBegin, int batchEnd ); + btScalar resolveMultipleContactFrictionConstraints( const btAlignedObjectArray& consIndices, int batchBegin, int batchEnd ); + btScalar resolveMultipleContactRollingFrictionConstraints( const btAlignedObjectArray& consIndices, int batchBegin, int batchEnd ); + btScalar resolveMultipleContactConstraintsInterleaved( const btAlignedObjectArray& consIndices, int batchBegin, int batchEnd ); + + void internalCollectContactManifoldCachedInfo(btContactManifoldCachedInfo* cachedInfoArray, btPersistentManifold** manifold, int numManifolds, const btContactSolverInfo& infoGlobal); + void internalAllocContactConstraints(const btContactManifoldCachedInfo* cachedInfoArray, int numManifolds); + void internalSetupContactConstraints(int iContact, const btContactSolverInfo& infoGlobal); + void internalConvertBodies(btCollisionObject** bodies, int iBegin, int iEnd, const btContactSolverInfo& infoGlobal); + void internalWriteBackContacts(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal); + void internalWriteBackJoints(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal); + void 
internalWriteBackBodies(int iBegin, int iEnd, const btContactSolverInfo& infoGlobal); +}; + + + + +#endif //BT_SEQUENTIAL_IMPULSE_CONSTRAINT_SOLVER_MT_H + diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp index 1d10bad92..330bccb87 100644 --- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp +++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp @@ -325,3 +325,14 @@ void btDiscreteDynamicsWorldMt::integrateTransforms( btScalar timeStep ) } } + +int btDiscreteDynamicsWorldMt::stepSimulation( btScalar timeStep, int maxSubSteps, btScalar fixedTimeStep ) +{ + int numSubSteps = btDiscreteDynamicsWorld::stepSimulation(timeStep, maxSubSteps, fixedTimeStep); + if (btITaskScheduler* scheduler = btGetTaskScheduler()) + { + // tell Bullet's threads to sleep, so other threads can run + scheduler->sleepWorkerThreadsHint(); + } + return numSubSteps; +} diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h index 2f144cdda..a8cc22dd0 100644 --- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h +++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h @@ -129,6 +129,8 @@ public: btCollisionConfiguration* collisionConfiguration ); virtual ~btDiscreteDynamicsWorldMt(); + + virtual int stepSimulation( btScalar timeStep, int maxSubSteps, btScalar fixedTimeStep ) BT_OVERRIDE; }; #endif //BT_DISCRETE_DYNAMICS_WORLD_H diff --git a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp index 65e1a140e..54ac39aaf 100644 --- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp +++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp @@ -22,6 +22,7 @@ subject to the following restrictions: #include "BulletCollision/CollisionDispatch/btCollisionObject.h" #include "BulletCollision/CollisionDispatch/btCollisionWorld.h" #include 
"BulletDynamics/ConstraintSolver/btTypedConstraint.h" +#include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.h" // for s_minimumContactManifoldsForBatching //#include #include "LinearMath/btQuickprof.h" @@ -589,14 +590,52 @@ struct UpdateIslandDispatcher : public btIParallelForBody } }; + void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray* islandsPtr, IslandCallback* callback ) { BT_PROFILE( "parallelIslandDispatch" ); - int grainSize = 1; // iterations per task + // + // if there are islands with many contacts, it may be faster to submit these + // large islands *serially* to a single parallel constraint solver, and then later + // submit the remaining smaller islands in parallel to multiple sequential solvers. + // + // Some task schedulers do not deal well with nested parallelFor loops. One implementation + // of OpenMP was actually slower than doing everything single-threaded. Intel TBB + // on the other hand, seems to do a pretty respectable job with it. + // + // When solving islands in parallel, the worst case performance happens when there + // is one very large island and then perhaps a smattering of very small + // islands -- one worker thread takes the large island and the remaining workers + // tear through the smaller islands and then sit idle waiting for the first worker + // to finish. Solving islands in parallel works best when there are numerous small + // islands, roughly equal in size. + // + // By contrast, the other approach -- the parallel constraint solver -- is only + // able to deliver a worthwhile speedup when the island is large. For smaller islands, + // it is difficult to extract a useful amount of parallelism -- the overhead of grouping + // the constraints into batches and sending the batches to worker threads can nullify + // any gains from parallelism. 
+ // + UpdateIslandDispatcher dispatcher; dispatcher.islandsPtr = islandsPtr; dispatcher.callback = callback; - btParallelFor( 0, islandsPtr->size(), grainSize, dispatcher ); + // We take advantage of the fact the islands are sorted in order of decreasing size + int iBegin = 0; + while (iBegin < islandsPtr->size()) + { + btSimulationIslandManagerMt::Island* island = (*islandsPtr)[ iBegin ]; + if (island->manifoldArray.size() < btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching) + { + // OK to submit the rest of the array in parallel + break; + } + ++iBegin; + } + // serial dispatch for large islands (if any) + dispatcher.forLoop(0, iBegin); + // parallel dispatch for rest + btParallelFor( iBegin, islandsPtr->size(), 1, dispatcher ); } diff --git a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h index 9a781aaef..31a2053b4 100644 --- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h +++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h @@ -106,5 +106,7 @@ public: } }; +extern int gLargeIslandManifoldCount; + #endif //BT_SIMULATION_ISLAND_MANAGER_H diff --git a/src/LinearMath/CMakeLists.txt b/src/LinearMath/CMakeLists.txt index ede21d9a7..0c8c0133a 100644 --- a/src/LinearMath/CMakeLists.txt +++ b/src/LinearMath/CMakeLists.txt @@ -14,6 +14,9 @@ SET(LinearMath_SRCS btSerializer64.cpp btThreads.cpp btVector3.cpp + TaskScheduler/btTaskScheduler.cpp + TaskScheduler/btThreadSupportPosix.cpp + TaskScheduler/btThreadSupportWin32.cpp ) SET(LinearMath_HDRS @@ -44,6 +47,7 @@ SET(LinearMath_HDRS btTransform.h btTransformUtil.h btVector3.h + TaskScheduler/btThreadSupportInterface.h ) ADD_LIBRARY(LinearMath ${LinearMath_SRCS} ${LinearMath_HDRS}) diff --git a/src/LinearMath/TaskScheduler/btTaskScheduler.cpp b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp new file mode 100644 index 000000000..e02458367 --- /dev/null +++ 
b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp
@@ -0,0 +1,619 @@
+
+#include "LinearMath/btMinMax.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btThreads.h"
+#include "LinearMath/btQuickprof.h"
+#include <stdio.h>
+#include <algorithm>
+
+
+typedef void( *btThreadFunc )( void* userPtr, void* lsMemory );
+typedef void* ( *btThreadLocalStorageFunc )();
+
+#if BT_THREADSAFE
+
+#include "btThreadSupportInterface.h"
+
+#if defined( _WIN32 )
+
+// windows.h is required by every Windows build of this file (btSpinPause uses YieldProcessor),
+// not only by the pre-C++11 getNumHardwareThreads() fallback below
+#define WIN32_LEAN_AND_MEAN
+
+#include <windows.h>
+
+#endif
+
+
+///
+/// getNumHardwareThreads()
+///
+///
+/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
+///
+#if __cplusplus >= 201103L
+
+#include <thread>
+
+int getNumHardwareThreads()
+{
+    return std::thread::hardware_concurrency();
+}
+
+#elif defined( _WIN32 )
+
+int getNumHardwareThreads()
+{
+    // caps out at 32
+    SYSTEM_INFO info;
+    GetSystemInfo( &info );
+    return info.dwNumberOfProcessors;
+}
+
+#else
+
+int getNumHardwareThreads()
+{
+    return 0; // don't know
+}
+
+#endif
+
+
+// hint to the CPU that the caller is busy-waiting; compiles to nothing outside of Windows
+void btSpinPause()
+{
+#if defined( _WIN32 )
+    YieldProcessor();
+#endif
+}
+
+
+struct WorkerThreadStatus
+{
+    enum Type
+    {
+        kInvalid,
+        kWaitingForWork,
+        kWorking,
+        kSleeping,
+    };
+};
+
+
+// a unit of work; threadId is 0 for the main thread and (worker index + 1) for workers
+struct IJob
+{
+    virtual void executeJob(int threadId) = 0;
+};
+
+// runs body.forLoop() over the sub-range [mBegin, mEnd)
+class ParallelForJob : public IJob
+{
+    const btIParallelForBody* mBody;
+    int mBegin;
+    int mEnd;
+
+public:
+    ParallelForJob( int iBegin, int iEnd, const btIParallelForBody& body )
+    {
+        mBody = &body;
+        mBegin = iBegin;
+        mEnd = iEnd;
+    }
+    virtual void executeJob(int threadId) BT_OVERRIDE
+    {
+        BT_PROFILE( "executeJob" );
+
+        // call the functor body to do the work
+        mBody->forLoop( mBegin, mEnd );
+    }
+};
+
+static const int kCacheLineSize = 64;
+
+// per-thread accumulator, padded out to a cache line so threads do not share one
+struct ThreadLocalSum
+{
+    btScalar mSum;
+    char mCachePadding[ kCacheLineSize - sizeof( btScalar ) ];
+};
+
+// runs body.sumLoop() over [mBegin, mEnd) and adds the result to the executing thread's slot
+class ParallelSumJob : public IJob
+{
+    const btIParallelSumBody* mBody;
+    ThreadLocalSum* mSumArray;
+    int mBegin;
+    int mEnd;
+
+public:
+    ParallelSumJob( int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalSum* sums )
+    {
+        mBody = &body;
+        mSumArray = sums;
+        mBegin = iBegin;
+        mEnd = iEnd;
+    }
+    virtual void executeJob( int threadId ) BT_OVERRIDE
+    {
+        BT_PROFILE( "executeJob" );
+
+        // call the functor body to do the work
+        btScalar val = mBody->sumLoop( mBegin, mEnd );
+        // by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision)
+        // (the value is rounded to the nearest multiple of 1/2^19)
+        const float TRUNC_SCALE = float(1<<19);
+        val = floor(val*TRUNC_SCALE+0.5f)/TRUNC_SCALE; // truncate some bits
+        mSumArray[threadId].mSum += val;
+    }
+};
+
+
+// job queue shared by the main thread (single producer) and all worker threads
+struct JobContext
+{
+    JobContext()
+    {
+        m_queueLock = NULL;
+        m_headIndex = 0;
+        m_tailIndex = 0;
+        m_queueIsEmpty = true; // was left uninitialized although consumeJob() reads it before taking the lock
+        m_workersShouldCheckQueue = false;
+        m_workersShouldSleep = false;
+        m_useSpinMutex = false;
+        m_coolDownTime = 1000; // 1000 microseconds
+    }
+    btCriticalSection* m_queueLock;
+    btSpinMutex m_mutex;
+    volatile bool m_workersShouldCheckQueue;
+    volatile bool m_workersShouldSleep;
+
+    btAlignedObjectArray<IJob*> m_jobQueue;
+    volatile bool m_queueIsEmpty; // polled without the lock by workers, hence volatile like the flags above
+    int m_tailIndex;
+    int m_headIndex;
+    bool m_useSpinMutex;
+    unsigned int m_coolDownTime;
+    btClock m_clock;
+
+    void lockQueue()
+    {
+        if ( m_useSpinMutex )
+        {
+            m_mutex.lock();
+        }
+        else
+        {
+            m_queueLock->lock();
+        }
+    }
+    void unlockQueue()
+    {
+        if ( m_useSpinMutex )
+        {
+            m_mutex.unlock();
+        }
+        else
+        {
+            m_queueLock->unlock();
+        }
+    }
+    void clearQueue()
+    {
+        lockQueue();
+        m_headIndex = 0;
+        m_tailIndex = 0;
+        m_queueIsEmpty = true;
+        unlockQueue();
+        m_jobQueue.resizeNoInitialize( 0 );
+    }
+    // the push_back happens outside the lock: callers reserve the queue capacity up front and
+    // consumers only index below m_tailIndex, which is published under the lock
+    void submitJob( IJob* job )
+    {
+        m_jobQueue.push_back( job );
+        lockQueue();
+        m_tailIndex++;
+        m_queueIsEmpty = false;
+        unlockQueue();
+    }
+    // returns the next job, or NULL when the queue is empty
+    IJob* consumeJob()
+    {
+        if ( m_queueIsEmpty )
+        {
+            // lock free path. even if this is taken erroneously it isn't harmful
+            return NULL;
+        }
+        IJob* job = NULL;
+        lockQueue();
+        if ( !m_queueIsEmpty )
+        {
+            job = m_jobQueue[ m_headIndex++ ];
+            if ( m_headIndex == m_tailIndex )
+            {
+                m_queueIsEmpty = true;
+            }
+        }
+        unlockQueue();
+        return job;
+    }
+};
+
+
+struct WorkerThreadLocalStorage
+{
+    int threadId;
+    WorkerThreadStatus::Type status;
+    int numJobsFinished;
+    btSpinMutex m_mutex;
+};
+
+
+// worker entry point: drains the queue, then spins until new work arrives, a sleep is requested,
+// or the queue has stayed empty for the cool-down time; returning puts the worker to sleep
+static void WorkerThreadFunc( void* userPtr, void* lsMemory )
+{
+    BT_PROFILE( "WorkerThreadFunc" );
+    WorkerThreadLocalStorage* localStorage = (WorkerThreadLocalStorage*) lsMemory;
+    JobContext* jobContext = (JobContext*) userPtr;
+
+    bool shouldSleep = false;
+    while (! shouldSleep)
+    {
+        // do work
+        localStorage->m_mutex.lock();
+        while ( IJob* job = jobContext->consumeJob() )
+        {
+            localStorage->status = WorkerThreadStatus::kWorking;
+            job->executeJob( localStorage->threadId );
+            localStorage->numJobsFinished++;
+        }
+        localStorage->status = WorkerThreadStatus::kWaitingForWork;
+        localStorage->m_mutex.unlock();
+        unsigned long long int clockStart = jobContext->m_clock.getTimeMicroseconds();
+        // while queue is empty,
+        while (jobContext->m_queueIsEmpty)
+        {
+            // todo: spin wait a bit to avoid hammering the empty queue
+            btSpinPause();
+            if ( jobContext->m_workersShouldSleep )
+            {
+                shouldSleep = true;
+                break;
+            }
+            // if jobs are incoming,
+            if (jobContext->m_workersShouldCheckQueue)
+            {
+                clockStart = jobContext->m_clock.getTimeMicroseconds(); // reset clock
+            }
+            else
+            {
+                // if no jobs incoming and queue has been empty for the cooldown time, sleep
+                unsigned long long int timeElapsed = jobContext->m_clock.getTimeMicroseconds() - clockStart;
+                if (timeElapsed > jobContext->m_coolDownTime)
+                {
+                    shouldSleep = true;
+                    break;
+                }
+            }
+        }
+    }
+
+    // go idle
+    localStorage->m_mutex.lock();
+    localStorage->status = WorkerThreadStatus::kSleeping;
+    localStorage->m_mutex.unlock();
+}
+
+
+// NOTE(review): nothing visible in this file frees this allocation -- confirm who owns it
+static void* WorkerThreadAllocFunc()
+{
+    return new WorkerThreadLocalStorage;
+}
+
+
+
+// default task scheduler built on btThreadSupportInterface; the calling ("main") thread takes
+// part in the work as thread 0
+class btTaskSchedulerDefault : public btITaskScheduler
+{
+    JobContext m_jobContext;
+    btThreadSupportInterface* m_threadSupport;
+    btAlignedObjectArray<char> m_jobMem;
+    btAlignedObjectArray<char> m_threadLocalMem;
+    btSpinMutex m_antiNestingLock; // prevent nested parallel-for
+    int m_numThreads;
+    int m_numWorkerThreads;
+    int m_maxNumThreads;
+    int m_numJobs;
+public:
+
+    btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
+    {
+        m_threadSupport = NULL;
+        // until init() runs there are no workers, so parallelFor/parallelSum stay on the calling thread
+        m_numThreads = 1;
+        m_numWorkerThreads = 0;
+        m_maxNumThreads = 1;
+        m_numJobs = 0;
+    }
+
+    virtual ~btTaskSchedulerDefault()
+    {
+        shutdown();
+    }
+
+    void init()
+    {
+        btThreadSupportInterface::ConstructionInfo constructionInfo( "TaskScheduler", WorkerThreadFunc, WorkerThreadAllocFunc );
+        m_threadSupport = btThreadSupportInterface::create( constructionInfo );
+
+        m_numWorkerThreads = m_threadSupport->getNumWorkerThreads();
+        m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1;
+        m_numThreads = m_maxNumThreads;
+        m_jobContext.m_queueLock = m_threadSupport->createCriticalSection();
+        for ( int i = 0; i < m_numWorkerThreads; i++ )
+        {
+            WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i );
+            btAssert( storage );
+            storage->threadId = i + 1; // workers start at 1
+            storage->status = WorkerThreadStatus::kSleeping;
+        }
+        setWorkersActive( false ); // no work for them yet
+        setNumThreads( m_threadSupport->getCacheFriendlyNumThreads() );
+    }
+
+    virtual void shutdown()
+    {
+        if ( m_threadSupport == NULL )
+        {
+            // never initialized, or already shut down (the destructor calls shutdown() again
+            // after an explicit call); nothing left to release
+            return;
+        }
+        setWorkersActive( false );
+        waitForWorkersToSleep();
+        m_threadSupport->deleteCriticalSection( m_jobContext.m_queueLock );
+        m_jobContext.m_queueLock = NULL;
+
+        delete m_threadSupport;
+        m_threadSupport = NULL;
+        // without workers every later parallel call falls back to the calling thread
+        m_numThreads = 1;
+        m_numWorkerThreads = 0;
+        m_maxNumThreads = 1;
+    }
+
+    void setWorkersActive( bool active )
+    {
+        m_jobContext.m_workersShouldCheckQueue = active;
+    }
+
+    virtual int getMaxNumThreads() const BT_OVERRIDE
+    {
+        return m_maxNumThreads;
+    }
+
+    virtual int getNumThreads() const BT_OVERRIDE
+    {
+        return m_numThreads;
+    }
+
+    // clamps the request to [1, getMaxNumThreads()]; the count includes the main thread
+    virtual void setNumThreads( int numThreads ) BT_OVERRIDE
+    {
+        m_numThreads = btMax( btMin(numThreads, int(m_maxNumThreads)), 1 );
+        m_numWorkerThreads = m_numThreads - 1;
+    }
+
+    void waitJobs()
+    {
+        BT_PROFILE( "waitJobs" );
+        // have the main thread work until the job queue is empty
+        int numMainThreadJobsFinished = 0;
+        while ( IJob* job = m_jobContext.consumeJob() )
+        {
+            job->executeJob( 0 );
+            numMainThreadJobsFinished++;
+        }
+        // done with jobs for now, tell workers to rest
+        setWorkersActive( false );
+
+        unsigned long long int clockStart = m_jobContext.m_clock.getTimeMicroseconds();
+        // wait for workers to finish any jobs in progress
+        while ( true )
+        {
+            int numWorkerJobsFinished = 0;
+            for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
+            {
+                WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
+                storage->m_mutex.lock();
+                numWorkerJobsFinished += storage->numJobsFinished;
+                storage->m_mutex.unlock();
+            }
+            if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs)
+            {
+                break;
+            }
+            // NOTE(review): the assert fires in debug builds once a worker needs more than 1 ms to
+            // finish its current job, and the break below returns after 100 ms even though jobs
+            // are still running, so callers may continue while work is in flight -- confirm that
+            // this time-out is intended
+            unsigned long long int timeElapsed = m_jobContext.m_clock.getTimeMicroseconds() - clockStart;
+            btAssert(timeElapsed < 1000);
+            if (timeElapsed > 100000)
+            {
+                break;
+            }
+            btSpinPause();
+        }
+    }
+
+    void wakeWorkers(int numWorkersToWake)
+    {
+        BT_PROFILE( "wakeWorkers" );
+        btAssert( m_jobContext.m_workersShouldCheckQueue );
+        int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads);
+        int numActiveWorkers = 0;
+        for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
+        {
+            // note this count of active workers is not necessarily totally reliable, because a worker thread could be
+            // just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare.
+            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
+            if (storage->status != WorkerThreadStatus::kSleeping)
+            {
+                numActiveWorkers++;
+            }
+        }
+        for ( int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker )
+        {
+            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
+            if (storage->status == WorkerThreadStatus::kSleeping)
+            {
+                m_threadSupport->runTask( iWorker, &m_jobContext );
+                numActiveWorkers++;
+            }
+        }
+    }
+
+    void waitForWorkersToSleep()
+    {
+        BT_PROFILE( "waitForWorkersToSleep" );
+        m_jobContext.m_workersShouldSleep = true;
+        m_threadSupport->waitForAllTasks();
+        for ( int i = 0; i < m_numWorkerThreads; i++ )
+        {
+            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory(i) );
+            btAssert( storage );
+            btAssert( storage->status == WorkerThreadStatus::kSleeping );
+        }
+    }
+
+    virtual void sleepWorkerThreadsHint() BT_OVERRIDE
+    {
+        BT_PROFILE( "sleepWorkerThreadsHint" );
+        // hint the task scheduler that we may not be using these threads for a little while
+        m_jobContext.m_workersShouldSleep = true;
+    }
+
+    // resets the per-worker job counters and tells the workers that jobs are incoming
+    void prepareWorkerThreads()
+    {
+        for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker )
+        {
+            WorkerThreadLocalStorage* storage = static_cast<WorkerThreadLocalStorage*>( m_threadSupport->getThreadLocalMemory( iWorker ) );
+            storage->m_mutex.lock();
+            storage->numJobsFinished = 0;
+            storage->m_mutex.unlock();
+        }
+        m_jobContext.m_workersShouldSleep = false;
+        setWorkersActive( true );
+    }
+
+    // splits [iBegin, iEnd) into jobs of at most grainSize iterations; nested calls, single-job
+    // ranges and schedulers without workers run the whole range on the calling thread
+    virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelFor_ThreadSupport" );
+        btAssert( iEnd >= iBegin );
+        btAssert( grainSize >= 1 );
+        int iterationCount = iEnd - iBegin;
+        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
+        {
+            typedef ParallelForJob JobType;
+            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;
+            m_numJobs = jobCount;
+            btAssert( jobCount >= 2 ); // need more than one job for multithreading
+            int jobSize = sizeof( JobType );
+            int jobBufSize = jobSize * jobCount;
+            // make sure we have enough memory allocated to store jobs
+            if ( jobBufSize > m_jobMem.size() )
+            {
+                m_jobMem.resize( jobBufSize );
+            }
+            // make sure job queue is big enough
+            if ( jobCount > m_jobContext.m_jobQueue.capacity() )
+            {
+                m_jobContext.m_jobQueue.reserve( jobCount );
+            }
+
+            m_jobContext.clearQueue();
+            // prepare worker threads for incoming work
+            prepareWorkerThreads();
+            // submit all of the jobs
+            int iJob = 0;
+            JobType* jobs = reinterpret_cast<JobType*>( &m_jobMem[ 0 ] );
+            for ( int i = iBegin; i < iEnd; i += grainSize )
+            {
+                btAssert( iJob < jobCount );
+                int iE = btMin( i + grainSize, iEnd );
+                JobType& job = jobs[ iJob ];
+                new ( (void*) &job ) ParallelForJob( i, iE, body ); // placement new
+                m_jobContext.submitJob( &job );
+                iJob++;
+            }
+            wakeWorkers( jobCount - 1 );
+
+            // put the main thread to work on emptying the job queue and then wait for all workers to finish
+            waitJobs();
+            m_antiNestingLock.unlock();
+        }
+        else
+        {
+            BT_PROFILE( "parallelFor_mainThread" );
+            // just run on main thread
+            body.forLoop( iBegin, iEnd );
+        }
+    }
+    // like parallelFor, but every job adds its sumLoop() result to a per-thread accumulator and
+    // the accumulators are added up in thread order once all jobs are done
+    virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE
+    {
+        BT_PROFILE( "parallelSum_ThreadSupport" );
+        btAssert( iEnd >= iBegin );
+        btAssert( grainSize >= 1 );
+        int iterationCount = iEnd - iBegin;
+        if ( iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock() )
+        {
+            typedef ParallelSumJob JobType;
+            int jobCount = ( iterationCount + grainSize - 1 ) / grainSize;
+            m_numJobs = jobCount;
+            btAssert( jobCount >= 2 ); // need more than one job for multithreading
+            int jobSize = sizeof( JobType );
+            int jobBufSize = jobSize * jobCount;
+            // make sure we have enough memory allocated to store jobs
+            if ( jobBufSize > m_jobMem.size() )
+            {
+                m_jobMem.resize( jobBufSize );
+            }
+            // make sure job queue is big enough
+            if ( jobCount > m_jobContext.m_jobQueue.capacity() )
+            {
+                m_jobContext.m_jobQueue.reserve( jobCount );
+            }
+            // make sure thread local area is big enough
+            int threadLocalSize = m_numThreads * sizeof( ThreadLocalSum );
+            if ( threadLocalSize > m_threadLocalMem.size() )
+            {
+                m_threadLocalMem.resize( threadLocalSize );
+            }
+            // initialize summation
+            ThreadLocalSum* threadLocalSum = reinterpret_cast<ThreadLocalSum*>( &m_threadLocalMem[ 0 ] );
+            for ( int iThread = 0; iThread < m_numThreads; ++iThread )
+            {
+                threadLocalSum[ iThread ].mSum = btScalar( 0 );
+            }
+
+            m_jobContext.clearQueue();
+            // prepare worker threads for incoming work
+            prepareWorkerThreads();
+            // submit all of the jobs
+            int iJob = 0;
+            JobType* jobs = reinterpret_cast<JobType*>( &m_jobMem[ 0 ] );
+            for ( int i = iBegin; i < iEnd; i += grainSize )
+            {
+                btAssert( iJob < jobCount );
+                int iE = btMin( i + grainSize, iEnd );
+                JobType& job = jobs[ iJob ];
+                new ( (void*) &job ) ParallelSumJob( i, iE, body, threadLocalSum ); // placement new
+                m_jobContext.submitJob( &job );
+                iJob++;
+            }
+            wakeWorkers( jobCount - 1 );
+
+            // put the main thread to work on emptying the job queue and then wait for all workers to finish
+            waitJobs();
+            m_antiNestingLock.unlock();
+
+            // add up all the thread sums
+            btScalar sum = btScalar(0);
+            for ( int iThread = 0; iThread < m_numThreads; ++iThread )
+            {
+                sum += threadLocalSum[ iThread ].mSum;
+            }
+            return sum;
+        }
+        else
+        {
+            BT_PROFILE( "parallelSum_mainThread" );
+            // just run on main thread
+            return body.sumLoop( iBegin, iEnd );
+        }
+    }
+};
+
+
+
+btITaskScheduler* btCreateDefaultTaskScheduler()
+{
+    btTaskSchedulerDefault* ts = new btTaskSchedulerDefault();
+    ts->init();
+    return ts;
+}
+
+#else // #if BT_THREADSAFE
+
+btITaskScheduler* btCreateDefaultTaskScheduler()
+{
+    return NULL;
+}
+
+#endif // #else // #if BT_THREADSAFE
\ No newline at end of file
diff --git
a/src/LinearMath/TaskScheduler/btThreadSupportInterface.h b/src/LinearMath/TaskScheduler/btThreadSupportInterface.h
new file mode 100644
index 000000000..d537d7095
--- /dev/null
+++ b/src/LinearMath/TaskScheduler/btThreadSupportInterface.h
@@ -0,0 +1,75 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2018 Erwin Coumans http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_THREAD_SUPPORT_INTERFACE_H
+#define BT_THREAD_SUPPORT_INTERFACE_H
+
+
+
+// Abstract lock; instances are obtained from btThreadSupportInterface::createCriticalSection()
+// and handed back to deleteCriticalSection().
+class btCriticalSection
+{
+public:
+    btCriticalSection() {}
+    virtual ~btCriticalSection() {}
+
+    virtual void lock() = 0;
+    virtual void unlock() = 0;
+};
+
+
+// Platform abstraction for a set of worker threads. Workers are addressed by index; each one
+// runs the ThreadFunc given in ConstructionInfo when runTask() is called for it.
+class btThreadSupportInterface
+{
+public:
+
+    virtual ~btThreadSupportInterface() {}
+
+    virtual int getNumWorkerThreads() const = 0; // number of worker threads (total number of logical processors - 1)
+    virtual int getCacheFriendlyNumThreads() const = 0; // the number of logical processors sharing a single L3 cache
+    // run the ThreadFunc on worker threadIndex; userData is passed to it as userPtr
+    virtual void runTask( int threadIndex, void* userData ) = 0;
+    // block until the tasks started with runTask() have finished
+    virtual void waitForAllTasks() = 0;
+
+    virtual btCriticalSection* createCriticalSection() = 0;
+    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) = 0;
+
+    // memory passed to the ThreadFunc of worker taskId as lsMemory; this default returns NULL
+    virtual void* getThreadLocalMemory( int taskId ) { return NULL; }
+
+    typedef void( *ThreadFunc )( void* userPtr, void* lsMemory );
+    typedef void* ( *MemorySetupFunc )( );
+
+    // parameters for create(); the const char* name is stored as a pointer, not copied
+    struct ConstructionInfo
+    {
+        ConstructionInfo( const char* uniqueName,
+            ThreadFunc userThreadFunc,
+            MemorySetupFunc lsMemoryFunc,
+            int threadStackSize = 65535
+            )
+            :m_uniqueName( uniqueName ),
+            m_userThreadFunc( userThreadFunc ),
+            m_lsMemoryFunc( lsMemoryFunc ),
+            m_threadStackSize( threadStackSize )
+        {
+        }
+
+        const char* m_uniqueName;
+        ThreadFunc m_userThreadFunc;
+        MemorySetupFunc m_lsMemoryFunc;
+        int m_threadStackSize;
+    };
+
+    // factory; the implementation is supplied by the platform-specific source file
+    static btThreadSupportInterface* create( const ConstructionInfo& info );
+};
+
+#endif //BT_THREAD_SUPPORT_INTERFACE_H
+
diff --git a/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp b/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
new file mode 100644
index 000000000..5521fc555
--- /dev/null
+++ b/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp
@@ -0,0 +1,369 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2018 Erwin Coumans http://bulletphysics.com
+
+This software is provided 'as-is', without
any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#if BT_THREADSAFE && !defined( _WIN32 )
+
+
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btThreads.h"
+#include "LinearMath/btMinMax.h"
+#include "btThreadSupportInterface.h"
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+
+
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600 //for definition of pthread_barrier_t, see http://pages.cs.wisc.edu/~travitch/pthreads_primer.html
+#endif //_XOPEN_SOURCE
+#include <pthread.h>
+#include <semaphore.h>
+#include <unistd.h> //for sysconf
+
+
+///
+/// getNumHardwareThreads()
+///
+///
+/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
+///
+/// Always returns at least 1, so that callers computing "hardware threads - 1" never end up
+/// with a negative worker count.
+///
+#if __cplusplus >= 201103L
+
+#include <thread>
+
+int btGetNumHardwareThreads()
+{
+    // hardware_concurrency() returns 0 when the value is not computable
+    const unsigned int numThreads = std::thread::hardware_concurrency();
+    return numThreads > 0 ? int( numThreads ) : 1;
+}
+
+#else
+
+int btGetNumHardwareThreads()
+{
+    // sysconf() returns -1 on error
+    const long numThreads = sysconf( _SC_NPROCESSORS_ONLN );
+    return numThreads > 0 ? int( numThreads ) : 1;
+}
+
+#endif
+
+
+// btThreadSupportPosix implements btThreadSupportInterface with pthreads and POSIX semaphores
+class btThreadSupportPosix : public btThreadSupportInterface
+{
+public:
+    // per-worker bookkeeping, shared between the worker thread and the thread driving it
+    struct btThreadStatus
+    {
+        int m_taskId;
+        int m_commandId;
+        int m_status; // 0: idle, 1: task started, 2: task finished, 3: thread exiting
+
+        ThreadFunc m_userThreadFunc;
+        void* m_userPtr; //for taskDesc etc
+        void* m_lsMemory; //initialized using the MemorySetupFunc from ConstructionInfo
+
+        pthread_t thread;
+        //each thread will wait until this signal to start its work
+        sem_t* startSemaphore;
+
+        // this is a copy of m_mainSemaphore,
+        //each thread will signal once it is finished with its work
+        sem_t* m_mainSemaphore;
+        unsigned long threadUsed;
+    };
+private:
+    typedef unsigned long long UINT64;
+
+    btAlignedObjectArray<btThreadStatus> m_activeThreadStatus;
+    // m_mainSemaphore will signal, if and how many threads are finished with their work
+    sem_t* m_mainSemaphore;
+    int m_numThreads;
+    UINT64 m_startedThreadsMask; // bit i is set while a task started on worker i has not been waited for
+    void startThreads( const ConstructionInfo& threadInfo );
+    void stopThreads();
+    int waitForResponse();
+
+public:
+    btThreadSupportPosix( const ConstructionInfo& threadConstructionInfo );
+    virtual ~btThreadSupportPosix();
+
+    virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
+    // TODO: return the number of logical processors sharing the first L3 cache
+    virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return m_numThreads + 1; }
+
+    virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE;
+    virtual void waitForAllTasks() BT_OVERRIDE;
+
+    virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
+    virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE;
+
+    virtual void* getThreadLocalMemory( int taskId ) BT_OVERRIDE
+    {
+        return m_activeThreadStatus[ taskId ].m_lsMemory;
+    }
+};
+
+
+// Reports a non-zero return value of a pthread/semaphore call. The argument is evaluated exactly
+// once (the failing call is not repeated while printing) and the macro expands to a single
+// statement, so it is safe inside unbraced if/else.
+#define checkPThreadFunction(returnValue) \
+    do { \
+        const int checkedReturnValue_ = (returnValue); \
+        if(0 != checkedReturnValue_) { \
+            printf("PThread problem at line %i in file %s: %i %d\n", __LINE__, __FILE__, checkedReturnValue_, errno); \
+        } \
+    } while(0)
+
+// The number of threads should be equal to the number of available cores
+// Todo: each worker should be linked to a single core, using SetThreadIdealProcessor.
+
+
+btThreadSupportPosix::btThreadSupportPosix( const ConstructionInfo& threadConstructionInfo )
+{
+    startThreads( threadConstructionInfo );
+}
+
+// stops and joins all worker threads
+btThreadSupportPosix::~btThreadSupportPosix()
+{
+    stopThreads();
+}
+
+#if (defined (__APPLE__))
+#define NAMED_SEMAPHORES
+#endif
+
+
+// creates a semaphore with an initial count of 0; release it with destroySem()
+static sem_t* createSem( const char* baseName )
+{
+    static int semCount = 0;
+#ifdef NAMED_SEMAPHORES
+    /// Named semaphore begin
+    char name[ 32 ];
+    // at most 8 characters of baseName, then the process id and a running counter
+    // (the former "%8.s" / "%4.d" printed padding instead of the name)
+    snprintf( name, 32, "/%.8s-%d-%4.4d", baseName, getpid(), semCount++ );
+    sem_t* tempSem = sem_open( name, O_CREAT, 0600, 0 );
+
+    if ( tempSem != reinterpret_cast<sem_t*>( SEM_FAILED ) )
+    {
+        // printf("Created \"%s\" Semaphore %p\n", name, tempSem);
+        // remove the name right away: the semaphore stays usable through tempSem until
+        // sem_close(), and no named object is left behind when the process ends
+        sem_unlink( name );
+    }
+    else
+    {
+        //printf("Error creating Semaphore %d\n", errno);
+        exit( -1 );
+    }
+    /// Named semaphore end
+#else
+    sem_t* tempSem = new sem_t;
+    checkPThreadFunction( sem_init( tempSem, 0, 0 ) );
+#endif
+    return tempSem;
+}
+
+static void destroySem( sem_t* semaphore )
+{
+#ifdef NAMED_SEMAPHORES
+    checkPThreadFunction( sem_close( semaphore ) );
+#else
+    checkPThreadFunction( sem_destroy( semaphore ) );
+    delete semaphore;
+#endif
+}
+
+// worker thread main loop: wait for the start semaphore, run the user function, signal the main
+// semaphore; a NULL m_userPtr is the request to exit
+static void *threadFunction( void *argument )
+{
+    btThreadSupportPosix::btThreadStatus* status = ( btThreadSupportPosix::btThreadStatus* )argument;
+
+    while ( 1 )
+    {
+        checkPThreadFunction( sem_wait( status->startSemaphore ) );
+        void* userPtr = status->m_userPtr;
+
+        if ( userPtr )
+        {
+            btAssert( status->m_status );
+            status->m_userThreadFunc( userPtr, status->m_lsMemory );
+            status->m_status = 2;
+            checkPThreadFunction( sem_post( status->m_mainSemaphore ) );
+            status->threadUsed++;
+        }
+        else
+        {
+            //exit Thread
+            status->m_status = 3;
+            checkPThreadFunction( sem_post( status->m_mainSemaphore ) );
+            printf( "Thread with taskId %i exiting\n", status->m_taskId );
+            break;
+        }
+    }
+
+    printf( "Thread TERMINATED\n" );
+    // a pthread start routine must return a value; flowing off the end is undefined behaviour
+    return NULL;
+}
+
+///start the user thread function on the given worker, passing userData as its userPtr
+void btThreadSupportPosix::runTask( int threadIndex, void* userData )
+{
+    // validate the index before it is used to form the reference
+    btAssert( threadIndex >= 0 );
+    btAssert( threadIndex < m_activeThreadStatus.size() );
+    btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ];
+
+    threadStatus.m_commandId = 1;
+    threadStatus.m_status = 1;
+    threadStatus.m_userPtr = userData;
+    m_startedThreadsMask |= UINT64( 1 ) << threadIndex;
+
+    // fire event to start new task
+    checkPThreadFunction( sem_post( threadStatus.startSemaphore ) );
+}
+
+
+///wait for one completion signal; returns the index of a worker that has finished its task, or
+///-1 when the signal belonged to a run whose status has already been overwritten by a later
+///runTask() on the same worker (waitForAllTasks() then simply waits again)
+int btThreadSupportPosix::waitForResponse()
+{
+    btAssert( m_activeThreadStatus.size() );
+
+    // wait for any of the threads to finish
+    checkPThreadFunction( sem_wait( m_mainSemaphore ) );
+    // get at least one thread which has finished
+    int last = -1;
+
+    for ( int t = 0; t < m_activeThreadStatus.size(); ++t )
+    {
+        if ( 2 == m_activeThreadStatus[ t ].m_status )
+        {
+            last = t;
+            break;
+        }
+    }
+
+    if ( last < 0 )
+    {
+        // nothing is marked finished; do not index the array with -1
+        return -1;
+    }
+
+    btThreadStatus& threadStatus = m_activeThreadStatus[ last ];
+
+    btAssert( threadStatus.m_status > 1 );
+    threadStatus.m_status = 0;
+
+    m_startedThreadsMask &= ~( UINT64( 1 ) << last );
+
+    return last;
+}
+
+
+void btThreadSupportPosix::waitForAllTasks()
+{
+    while ( m_startedThreadsMask )
+    {
+        waitForResponse();
+    }
+}
+
+
+void btThreadSupportPosix::startThreads( const ConstructionInfo& threadConstructionInfo )
+{
+    // one bit of m_startedThreadsMask per worker limits the number of workers
+    const int maxWorkerThreads = int( sizeof( UINT64 ) * 8 );
+    m_numThreads = btGetNumHardwareThreads() - 1; // main thread exists already
+    m_numThreads = btMax( 0, btMin( m_numThreads, maxWorkerThreads ) );
+    printf( "%s creating %i threads.\n", __FUNCTION__, m_numThreads );
+    m_activeThreadStatus.resize( m_numThreads );
+    m_startedThreadsMask = 0;
+
+    m_mainSemaphore = createSem( "main" );
+    //checkPThreadFunction(sem_wait(mainSemaphore));
+
+    for ( int i = 0; i < m_numThreads; i++ )
+    {
+        printf( "starting thread %d\n", i );
+        btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
+        // fill in the whole status record before the thread that reads it is created
+        threadStatus.startSemaphore = createSem( "threadLocal" );
+        threadStatus.m_userPtr = 0;
+        threadStatus.m_taskId = i;
+        threadStatus.m_commandId = 0;
+        threadStatus.m_status = 0;
+        threadStatus.m_mainSemaphore = m_mainSemaphore;
+        threadStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
+        threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
+        threadStatus.threadUsed = 0;
+
+        checkPThreadFunction( pthread_create( &threadStatus.thread, NULL, &threadFunction, (void*) &threadStatus ) );
+
+        printf( "started thread %d \n", i );
+    }
+}
+
+///ask every worker thread to exit, then join it and release its semaphore
+void btThreadSupportPosix::stopThreads()
+{
+    for ( size_t t = 0; t < size_t( m_activeThreadStatus.size() ); ++t )
+    {
+        btThreadStatus& threadStatus = m_activeThreadStatus[ t ];
+        printf( "%s: Thread %i used: %lu\n", __FUNCTION__, int( t ), threadStatus.threadUsed );
+
+        // a NULL user pointer tells threadFunction to leave its loop
+        threadStatus.m_userPtr = 0;
+        checkPThreadFunction( sem_post( threadStatus.startSemaphore ) );
+        checkPThreadFunction( sem_wait( m_mainSemaphore ) );
+
+        printf( "destroy semaphore\n" );
+        destroySem( threadStatus.startSemaphore );
+        printf( "semaphore destroyed\n" );
+        checkPThreadFunction( pthread_join( threadStatus.thread, 0 ) );
+
+    }
+    printf( "destroy main semaphore\n" );
+    destroySem( m_mainSemaphore );
+    printf( "main semaphore destroyed\n" );
+    m_activeThreadStatus.clear();
+}
+
+// btCriticalSection implemented with a pthread mutex using the default attributes
+class btCriticalSectionPosix : public btCriticalSection
+{
+    pthread_mutex_t m_mutex;
+
+public:
+    btCriticalSectionPosix()
+    {
+        pthread_mutex_init( &m_mutex, NULL );
+    }
+    virtual ~btCriticalSectionPosix()
+    {
+        pthread_mutex_destroy( &m_mutex );
+    }
+
+    virtual void lock()
+    {
+        pthread_mutex_lock( &m_mutex );
+    }
+    virtual void unlock()
+    {
+        pthread_mutex_unlock( &m_mutex );
+    }
+};
+
+
+btCriticalSection* btThreadSupportPosix::createCriticalSection() +{ + return new btCriticalSectionPosix(); +} + +void btThreadSupportPosix::deleteCriticalSection( btCriticalSection* cs ) +{ + delete cs; +} + + +btThreadSupportInterface* btThreadSupportInterface::create( const ConstructionInfo& info ) +{ + return new btThreadSupportPosix( info ); +} + +#endif // BT_THREADSAFE && !defined( _WIN32 ) + diff --git a/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp new file mode 100644 index 000000000..f77616337 --- /dev/null +++ b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp @@ -0,0 +1,480 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2003-2018 Erwin Coumans http://bulletphysics.com + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. 
+*/ + +#if defined( _WIN32 ) && BT_THREADSAFE + +#include "LinearMath/btScalar.h" +#include "LinearMath/btMinMax.h" +#include "LinearMath/btAlignedObjectArray.h" +#include "LinearMath/btThreads.h" +#include "btThreadSupportInterface.h" +#include +#include + + +struct btProcessorInfo +{ + int numLogicalProcessors; + int numCores; + int numNumaNodes; + int numL1Cache; + int numL2Cache; + int numL3Cache; + int numPhysicalPackages; + static const int maxNumTeamMasks = 32; + int numTeamMasks; + UINT64 processorTeamMasks[ maxNumTeamMasks ]; +}; + +UINT64 getProcessorTeamMask( const btProcessorInfo& procInfo, int procId ) +{ + UINT64 procMask = UINT64( 1 ) << procId; + for ( int i = 0; i < procInfo.numTeamMasks; ++i ) + { + if ( procMask & procInfo.processorTeamMasks[ i ] ) + { + return procInfo.processorTeamMasks[ i ]; + } + } + return 0; +} + +int getProcessorTeamIndex( const btProcessorInfo& procInfo, int procId ) +{ + UINT64 procMask = UINT64( 1 ) << procId; + for ( int i = 0; i < procInfo.numTeamMasks; ++i ) + { + if ( procMask & procInfo.processorTeamMasks[ i ] ) + { + return i; + } + } + return -1; +} + +int countSetBits( ULONG64 bits ) +{ + int count = 0; + while ( bits ) + { + if ( bits & 1 ) + { + count++; + } + bits >>= 1; + } + return count; +} + + +typedef BOOL( WINAPI *Pfn_GetLogicalProcessorInformation )( PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD ); + + +void getProcessorInformation( btProcessorInfo* procInfo ) +{ + memset( procInfo, 0, sizeof( *procInfo ) ); + Pfn_GetLogicalProcessorInformation getLogicalProcInfo = + (Pfn_GetLogicalProcessorInformation) GetProcAddress( GetModuleHandle( TEXT( "kernel32" ) ), "GetLogicalProcessorInformation" ); + if ( getLogicalProcInfo == NULL ) + { + // no info + return; + } + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buf = NULL; + DWORD bufSize = 0; + while ( true ) + { + if ( getLogicalProcInfo( buf, &bufSize ) ) + { + break; + } + else + { + if ( GetLastError() == ERROR_INSUFFICIENT_BUFFER ) + { + if ( buf ) + { + free( 
buf ); + } + buf = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc( bufSize ); + } + } + } + + int len = bufSize / sizeof( *buf ); + for ( int i = 0; i < len; ++i ) + { + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION info = buf + i; + switch ( info->Relationship ) + { + case RelationNumaNode: + procInfo->numNumaNodes++; + break; + + case RelationProcessorCore: + procInfo->numCores++; + procInfo->numLogicalProcessors += countSetBits( info->ProcessorMask ); + break; + + case RelationCache: + if ( info->Cache.Level == 1 ) + { + procInfo->numL1Cache++; + } + else if ( info->Cache.Level == 2 ) + { + procInfo->numL2Cache++; + } + else if ( info->Cache.Level == 3 ) + { + procInfo->numL3Cache++; + // processors that share L3 cache are considered to be on the same team + // because they can more easily work together on the same data. + // Large performance penalties will occur if 2 or more threads from different + // teams attempt to frequently read and modify the same cache lines. + // + // On the AMD Ryzen 7 CPU for example, the 8 cores on the CPU are split into + // 2 CCX units of 4 cores each. Each CCX has a separate L3 cache, so if both + // CCXs are operating on the same data, many cycles will be spent keeping the + // two caches coherent. 
+ if ( procInfo->numTeamMasks < btProcessorInfo::maxNumTeamMasks ) + { + procInfo->processorTeamMasks[ procInfo->numTeamMasks ] = info->ProcessorMask; + procInfo->numTeamMasks++; + } + } + break; + + case RelationProcessorPackage: + procInfo->numPhysicalPackages++; + break; + } + } + free( buf ); +} + + + +///btThreadSupportWin32 helps to initialize/shutdown libspe2, start/stop SPU tasks and communication +class btThreadSupportWin32 : public btThreadSupportInterface +{ +public: + struct btThreadStatus + { + int m_taskId; + int m_commandId; + int m_status; + + ThreadFunc m_userThreadFunc; + void* m_userPtr; //for taskDesc etc + void* m_lsMemory; //initialized using Win32LocalStoreMemorySetupFunc + + void* m_threadHandle; //this one is calling 'Win32ThreadFunc' + + void* m_eventStartHandle; + char m_eventStartHandleName[ 32 ]; + + void* m_eventCompleteHandle; + char m_eventCompleteHandleName[ 32 ]; + }; + +private: + btAlignedObjectArray m_activeThreadStatus; + btAlignedObjectArray m_completeHandles; + int m_numThreads; + DWORD_PTR m_startedThreadMask; + btProcessorInfo m_processorInfo; + + void startThreads( const ConstructionInfo& threadInfo ); + void stopThreads(); + int waitForResponse(); + +public: + + btThreadSupportWin32( const ConstructionInfo& threadConstructionInfo ); + virtual ~btThreadSupportWin32(); + + virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; } + virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); } + + virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE; + virtual void waitForAllTasks() BT_OVERRIDE; + + virtual void* getThreadLocalMemory( int taskId ) BT_OVERRIDE + { + return m_activeThreadStatus[ taskId ].m_lsMemory; + } + + virtual btCriticalSection* createCriticalSection() BT_OVERRIDE; + virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE; +}; + + +btThreadSupportWin32::btThreadSupportWin32( const 
ConstructionInfo & threadConstructionInfo ) +{ + startThreads( threadConstructionInfo ); +} + + +btThreadSupportWin32::~btThreadSupportWin32() +{ + stopThreads(); +} + + +DWORD WINAPI win32threadStartFunc( LPVOID lpParam ) +{ + btThreadSupportWin32::btThreadStatus* status = ( btThreadSupportWin32::btThreadStatus* )lpParam; + + while ( 1 ) + { + WaitForSingleObject( status->m_eventStartHandle, INFINITE ); + void* userPtr = status->m_userPtr; + + if ( userPtr ) + { + btAssert( status->m_status ); + status->m_userThreadFunc( userPtr, status->m_lsMemory ); + status->m_status = 2; + SetEvent( status->m_eventCompleteHandle ); + } + else + { + //exit Thread + status->m_status = 3; + printf( "Thread with taskId %i with handle %p exiting\n", status->m_taskId, status->m_threadHandle ); + SetEvent( status->m_eventCompleteHandle ); + break; + } + } + printf( "Thread TERMINATED\n" ); + return 0; +} + + +void btThreadSupportWin32::runTask( int threadIndex, void* userData ) +{ + btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ]; + btAssert( taskId >= 0 ); + btAssert( int( taskId ) < m_activeThreadStatus.size() ); + + threadStatus.m_commandId = 1; + threadStatus.m_status = 1; + threadStatus.m_userPtr = userData; + m_startedThreadMask |= DWORD_PTR( 1 ) << threadIndex; + + ///fire event to start new task + SetEvent( threadStatus.m_eventStartHandle ); +} + + +int btThreadSupportWin32::waitForResponse() +{ + btAssert( m_activeThreadStatus.size() ); + + int last = -1; + DWORD res = WaitForMultipleObjects( m_completeHandles.size(), &m_completeHandles[ 0 ], FALSE, INFINITE ); + btAssert( res != WAIT_FAILED ); + last = res - WAIT_OBJECT_0; + + btThreadStatus& threadStatus = m_activeThreadStatus[ last ]; + btAssert( threadStatus.m_threadHandle ); + btAssert( threadStatus.m_eventCompleteHandle ); + + //WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE); + btAssert( threadStatus.m_status > 1 ); + threadStatus.m_status = 0; + + ///need to find an active spu + 
btAssert( last >= 0 ); + m_startedThreadMask &= ~( DWORD_PTR( 1 ) << last ); + + return last; +} + + +void btThreadSupportWin32::waitForAllTasks() +{ + while ( m_startedThreadMask ) + { + waitForResponse(); + } +} + + +void btThreadSupportWin32::startThreads( const ConstructionInfo& threadConstructionInfo ) +{ + static int uniqueId = 0; + uniqueId++; + btProcessorInfo& procInfo = m_processorInfo; + getProcessorInformation( &procInfo ); + DWORD_PTR dwProcessAffinityMask = 0; + DWORD_PTR dwSystemAffinityMask = 0; + if ( !GetProcessAffinityMask( GetCurrentProcess(), &dwProcessAffinityMask, &dwSystemAffinityMask ) ) + { + dwProcessAffinityMask = 0; + } + ///The number of threads should be equal to the number of available cores - 1 + m_numThreads = btMin(procInfo.numLogicalProcessors, int(BT_MAX_THREAD_COUNT)) - 1; // cap to max thread count (-1 because main thread already exists) + + m_activeThreadStatus.resize( m_numThreads ); + m_completeHandles.resize( m_numThreads ); + m_startedThreadMask = 0; + + // set main thread affinity + if ( DWORD_PTR mask = dwProcessAffinityMask & getProcessorTeamMask( procInfo, 0 )) + { + SetThreadAffinityMask( GetCurrentThread(), mask ); + SetThreadIdealProcessor( GetCurrentThread(), 0 ); + } + + for ( int i = 0; i < m_numThreads; i++ ) + { + printf( "starting thread %d\n", i ); + + btThreadStatus& threadStatus = m_activeThreadStatus[ i ]; + + LPSECURITY_ATTRIBUTES lpThreadAttributes = NULL; + SIZE_T dwStackSize = threadConstructionInfo.m_threadStackSize; + LPTHREAD_START_ROUTINE lpStartAddress = &win32threadStartFunc; + LPVOID lpParameter = &threadStatus; + DWORD dwCreationFlags = 0; + LPDWORD lpThreadId = 0; + + threadStatus.m_userPtr = 0; + + sprintf( threadStatus.m_eventStartHandleName, "es%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i ); + threadStatus.m_eventStartHandle = CreateEventA( 0, false, false, threadStatus.m_eventStartHandleName ); + + sprintf( threadStatus.m_eventCompleteHandleName, "ec%.8s%d%d", 
threadConstructionInfo.m_uniqueName, uniqueId, i ); + threadStatus.m_eventCompleteHandle = CreateEventA( 0, false, false, threadStatus.m_eventCompleteHandleName ); + + m_completeHandles[ i ] = threadStatus.m_eventCompleteHandle; + + HANDLE handle = CreateThread( lpThreadAttributes, dwStackSize, lpStartAddress, lpParameter, dwCreationFlags, lpThreadId ); + //SetThreadPriority( handle, THREAD_PRIORITY_HIGHEST ); + // highest priority -- can cause erratic performance when numThreads > numCores + // we don't want worker threads to be higher priority than the main thread or the main thread could get + // totally shut out and unable to tell the workers to stop + //SetThreadPriority( handle, THREAD_PRIORITY_BELOW_NORMAL ); + + { + int processorId = i + 1; // leave processor 0 for main thread + DWORD_PTR teamMask = getProcessorTeamMask( procInfo, processorId ); + if ( teamMask ) + { + // bind each thread to only execute on processors of it's assigned team + // - for single-socket Intel x86 CPUs this has no effect (only a single, shared L3 cache so there is only 1 team) + // - for multi-socket Intel this will keep threads from migrating from one socket to another + // - for AMD Ryzen this will keep threads from migrating from one CCX to another + DWORD_PTR mask = teamMask & dwProcessAffinityMask; + if ( mask ) + { + SetThreadAffinityMask( handle, mask ); + } + } + SetThreadIdealProcessor( handle, processorId ); + } + + threadStatus.m_taskId = i; + threadStatus.m_commandId = 0; + threadStatus.m_status = 0; + threadStatus.m_threadHandle = handle; + threadStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc(); + threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc; + + printf( "started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle ); + } +} + +///tell the task scheduler we are done with the SPU tasks +void btThreadSupportWin32::stopThreads() +{ + for ( int i = 0; i < m_activeThreadStatus.size(); i++ ) + { + 
btThreadStatus& threadStatus = m_activeThreadStatus[ i ]; + if ( threadStatus.m_status > 0 ) + { + WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE ); + } + + delete threadStatus.m_lsMemory; + + threadStatus.m_userPtr = 0; + SetEvent( threadStatus.m_eventStartHandle ); + WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE ); + + CloseHandle( threadStatus.m_eventCompleteHandle ); + CloseHandle( threadStatus.m_eventStartHandle ); + CloseHandle( threadStatus.m_threadHandle ); + + } + + m_activeThreadStatus.clear(); + m_completeHandles.clear(); +} + + +class btWin32CriticalSection : public btCriticalSection +{ +private: + CRITICAL_SECTION mCriticalSection; + +public: + btWin32CriticalSection() + { + InitializeCriticalSection( &mCriticalSection ); + } + + ~btWin32CriticalSection() + { + DeleteCriticalSection( &mCriticalSection ); + } + + void lock() + { + EnterCriticalSection( &mCriticalSection ); + } + + void unlock() + { + LeaveCriticalSection( &mCriticalSection ); + } +}; + + +btCriticalSection* btThreadSupportWin32::createCriticalSection() +{ + unsigned char* mem = (unsigned char*) btAlignedAlloc( sizeof( btWin32CriticalSection ), 16 ); + btWin32CriticalSection* cs = new( mem ) btWin32CriticalSection(); + return cs; +} + +void btThreadSupportWin32::deleteCriticalSection( btCriticalSection* criticalSection ) +{ + criticalSection->~btCriticalSection(); + btAlignedFree( criticalSection ); +} + + +btThreadSupportInterface* btThreadSupportInterface::create( const ConstructionInfo& info ) +{ + return new btThreadSupportWin32( info ); +} + + + +#endif //defined(_WIN32) && BT_THREADSAFE + diff --git a/src/LinearMath/btThreads.cpp b/src/LinearMath/btThreads.cpp index 59a7ea36e..c037626ff 100644 --- a/src/LinearMath/btThreads.cpp +++ b/src/LinearMath/btThreads.cpp @@ -453,6 +453,33 @@ void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBod #endif// #if BT_THREADSAFE } +btScalar btParallelSum( int iBegin, int iEnd, int 
grainSize, const btIParallelSumBody& body ) +{ +#if BT_THREADSAFE + +#if BT_DETECT_BAD_THREAD_INDEX + if ( !btThreadsAreRunning() ) + { + // clear out thread ids + for ( int i = 0; i < BT_MAX_THREAD_COUNT; ++i ) + { + gDebugThreadIds[ i ] = kInvalidThreadId; + } + } +#endif // #if BT_DETECT_BAD_THREAD_INDEX + + btAssert( gBtTaskScheduler != NULL ); // call btSetTaskScheduler() with a valid task scheduler first! + return gBtTaskScheduler->parallelSum( iBegin, iEnd, grainSize, body ); + +#else // #if BT_THREADSAFE + + // non-parallel version of btParallelSum + btAssert( !"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE" ); + return body.sumLoop( iBegin, iEnd ); + +#endif //#else // #if BT_THREADSAFE +} + /// /// btTaskSchedulerSequential -- non-threaded implementation of task scheduler @@ -470,6 +497,11 @@ public: BT_PROFILE( "parallelFor_sequential" ); body.forLoop( iBegin, iEnd ); } + virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE + { + BT_PROFILE( "parallelSum_sequential" ); + return body.sumLoop( iBegin, iEnd ); + } }; @@ -514,11 +546,25 @@ public: #pragma omp parallel for schedule( static, 1 ) for ( int i = iBegin; i < iEnd; i += grainSize ) { - BT_PROFILE( "OpenMP_job" ); + BT_PROFILE( "OpenMP_forJob" ); body.forLoop( i, ( std::min )( i + grainSize, iEnd ) ); } btPopThreadsAreRunning(); } + virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE + { + BT_PROFILE( "parallelFor_OpenMP" ); + btPushThreadsAreRunning(); + btScalar sum = btScalar( 0 ); +#pragma omp parallel for schedule( static, 1 ) reduction(+:sum) + for ( int i = iBegin; i < iEnd; i += grainSize ) + { + BT_PROFILE( "OpenMP_sumJob" ); + sum += body.sumLoop( i, ( std::min )( i + grainSize, iEnd ) ); + } + btPopThreadsAreRunning(); + return sum; + } }; #endif // #if BT_USE_OPENMP && BT_THREADSAFE @@ -571,22 +617,21 @@ public: btResetThreadIndexCounter(); } } - 
struct BodyAdapter + struct ForBodyAdapter { const btIParallelForBody* mBody; + ForBodyAdapter( const btIParallelForBody* body ) : mBody( body ) {} void operator()( const tbb::blocked_range& range ) const { - BT_PROFILE( "TBB_job" ); + BT_PROFILE( "TBB_forJob" ); mBody->forLoop( range.begin(), range.end() ); } }; virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE { BT_PROFILE( "parallelFor_TBB" ); - // TBB dispatch - BodyAdapter tbbBody; - tbbBody.mBody = &body; + ForBodyAdapter tbbBody( &body ); btPushThreadsAreRunning(); tbb::parallel_for( tbb::blocked_range( iBegin, iEnd, grainSize ), tbbBody, @@ -594,6 +639,29 @@ public: ); btPopThreadsAreRunning(); } + struct SumBodyAdapter + { + const btIParallelSumBody* mBody; + btScalar mSum; + + SumBodyAdapter( const btIParallelSumBody* body ) : mBody( body ), mSum( btScalar( 0 ) ) {} + SumBodyAdapter( const SumBodyAdapter& src, tbb::split ) : mBody( src.mBody ), mSum( btScalar( 0 ) ) {} + void join( const SumBodyAdapter& src ) { mSum += src.mSum; } + void operator()( const tbb::blocked_range& range ) + { + BT_PROFILE( "TBB_sumJob" ); + mSum += mBody->sumLoop( range.begin(), range.end() ); + } + }; + virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE + { + BT_PROFILE( "parallelSum_TBB" ); + SumBodyAdapter tbbBody( &body ); + btPushThreadsAreRunning(); + tbb::parallel_deterministic_reduce( tbb::blocked_range( iBegin, iEnd, grainSize ), tbbBody ); + btPopThreadsAreRunning(); + return tbbBody.mSum; + } }; #endif // #if BT_USE_TBB && BT_THREADSAFE @@ -605,6 +673,7 @@ public: class btTaskSchedulerPPL : public btITaskScheduler { int m_numThreads; + concurrency::combinable m_sum; // for parallelSum public: btTaskSchedulerPPL() : btITaskScheduler( "PPL" ) { @@ -644,15 +713,16 @@ public: btResetThreadIndexCounter(); } } - struct BodyAdapter + struct ForBodyAdapter { const btIParallelForBody* mBody; int mGrainSize; 
int mIndexEnd; + ForBodyAdapter( const btIParallelForBody* body, int grainSize, int end ) : mBody( body ), mGrainSize( grainSize ), mIndexEnd( end ) {} void operator()( int i ) const { - BT_PROFILE( "PPL_job" ); + BT_PROFILE( "PPL_forJob" ); mBody->forLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) ); } }; @@ -660,10 +730,7 @@ public: { BT_PROFILE( "parallelFor_PPL" ); // PPL dispatch - BodyAdapter pplBody; - pplBody.mBody = &body; - pplBody.mGrainSize = grainSize; - pplBody.mIndexEnd = iEnd; + ForBodyAdapter pplBody( &body, grainSize, iEnd ); btPushThreadsAreRunning(); // note: MSVC 2010 doesn't support partitioner args, so avoid them concurrency::parallel_for( iBegin, @@ -673,6 +740,36 @@ public: ); btPopThreadsAreRunning(); } + struct SumBodyAdapter + { + const btIParallelSumBody* mBody; + concurrency::combinable* mSum; + int mGrainSize; + int mIndexEnd; + + SumBodyAdapter( const btIParallelSumBody* body, concurrency::combinable* sum, int grainSize, int end ) : mBody( body ), mSum(sum), mGrainSize( grainSize ), mIndexEnd( end ) {} + void operator()( int i ) const + { + BT_PROFILE( "PPL_sumJob" ); + mSum->local() += mBody->sumLoop( i, ( std::min )( i + mGrainSize, mIndexEnd ) ); + } + }; + static btScalar sumFunc( btScalar a, btScalar b ) { return a + b; } + virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) BT_OVERRIDE + { + BT_PROFILE( "parallelSum_PPL" ); + m_sum.clear(); + SumBodyAdapter pplBody( &body, &m_sum, grainSize, iEnd ); + btPushThreadsAreRunning(); + // note: MSVC 2010 doesn't support partitioner args, so avoid them + concurrency::parallel_for( iBegin, + iEnd, + grainSize, + pplBody + ); + btPopThreadsAreRunning(); + return m_sum.combine( sumFunc ); + } }; #endif // #if BT_USE_PPL && BT_THREADSAFE diff --git a/src/LinearMath/btThreads.h b/src/LinearMath/btThreads.h index cef542329..ecd5a19cf 100644 --- a/src/LinearMath/btThreads.h +++ b/src/LinearMath/btThreads.h @@ -107,6 +107,17 @@ public: 
virtual void forLoop( int iBegin, int iEnd ) const = 0; }; +// +// btIParallelSumBody -- subclass this to express work that can be done in parallel +// and produces a sum over all loop elements +// +class btIParallelSumBody +{ +public: + virtual ~btIParallelSumBody() {} + virtual btScalar sumLoop( int iBegin, int iEnd ) const = 0; +}; + // // btITaskScheduler -- subclass this to implement a task scheduler that can dispatch work to // worker threads @@ -122,6 +133,8 @@ public: virtual int getNumThreads() const = 0; virtual void setNumThreads( int numThreads ) = 0; virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) = 0; + virtual btScalar parallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ) = 0; + virtual void sleepWorkerThreadsHint() {} // hint the task scheduler that we may not be using these threads for a little while // internal use only virtual void activate(); @@ -143,6 +156,9 @@ btITaskScheduler* btGetTaskScheduler(); // get non-threaded task scheduler (always available) btITaskScheduler* btGetSequentialTaskScheduler(); +// create a default task scheduler (Win32 or pthreads based) +btITaskScheduler* btCreateDefaultTaskScheduler(); + // get OpenMP task scheduler (if available, otherwise returns null) btITaskScheduler* btGetOpenMPTaskScheduler(); @@ -156,5 +172,9 @@ btITaskScheduler* btGetPPLTaskScheduler(); // (iterations may be done out of order, so no dependencies are allowed) void btParallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ); +// btParallelSum -- call this to dispatch work like a for-loop, returns the sum of all iterations +// (iterations may be done out of order, so no dependencies are allowed) +btScalar btParallelSum( int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body ); + #endif From 06690e542b22b08c2c120d110ba8225211afb5ce Mon Sep 17 00:00:00 2001 From: Lunkhound Date: Tue, 27 Feb 2018 00:30:45 -0800 Subject: [PATCH 2/8] fix 
compile error for GCC --- src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp index bc840e889..a27350bf9 100644 --- a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp +++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp @@ -1113,7 +1113,7 @@ void btBatchedConstraints::setup( { if (constraints->size() >= minBatchSize*4) { - bool use2DGrid = batchingMethod == BatchingMethod::BATCHING_METHOD_SPATIAL_GRID_2D; + bool use2DGrid = batchingMethod == BATCHING_METHOD_SPATIAL_GRID_2D; setupSpatialGridBatchesMt( this, scratchMemory, constraints, bodies, minBatchSize, maxBatchSize, use2DGrid ); if (s_debugDrawBatches) { From d900a749392465e6bbadfdaa98635abcf30b30f4 Mon Sep 17 00:00:00 2001 From: Lunkhound Date: Tue, 27 Feb 2018 03:03:12 -0800 Subject: [PATCH 3/8] add new source files to setup.py to fix travis-ci build --- setup.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 664059460..bdf38bf27 100644 --- a/setup.py +++ b/setup.py @@ -134,6 +134,9 @@ sources = ["examples/pybullet/pybullet.c"]\ +["src/LinearMath/btConvexHullComputer.cpp"]\ +["src/LinearMath/btQuickprof.cpp"]\ +["src/LinearMath/btThreads.cpp"]\ ++["src/LinearMath/TaskScheduler/btTaskScheduler.cpp"]\ ++["src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp"]\ ++["src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp"]\ +["src/BulletCollision/BroadphaseCollision/btAxisSweep3.cpp"]\ +["src/BulletCollision/BroadphaseCollision/btDbvt.cpp"]\ +["src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp"]\ @@ -233,6 +236,7 @@ sources = ["examples/pybullet/pybullet.c"]\ +["src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp"]\ +["src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp"]\ 
+["src/BulletDynamics/Dynamics/btSimpleDynamicsWorld.cpp"]\ ++["src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp"]\ +["src/BulletDynamics/ConstraintSolver/btConeTwistConstraint.cpp"]\ +["src/BulletDynamics/ConstraintSolver/btGeneric6DofSpringConstraint.cpp"]\ +["src/BulletDynamics/ConstraintSolver/btSliderConstraint.cpp"]\ @@ -249,6 +253,7 @@ sources = ["examples/pybullet/pybullet.c"]\ +["src/BulletDynamics/ConstraintSolver/btPoint2PointConstraint.cpp"]\ +["src/BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.cpp"]\ +["src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.cpp"]\ ++["src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp"]\ +["src/BulletDynamics/MLCPSolvers/btDantzigLCP.cpp"]\ +["src/BulletDynamics/MLCPSolvers/btLemkeAlgorithm.cpp"]\ +["src/BulletDynamics/MLCPSolvers/btMLCPSolver.cpp"]\ @@ -479,4 +484,3 @@ setup( packages=[x for x in find_packages('examples/pybullet/gym')], package_data = { 'pybullet_data': need_files } ) - From 45fd4acf6eb69389f9a600903a7b7bf8e813815f Mon Sep 17 00:00:00 2001 From: Lunkhound Date: Mon, 5 Mar 2018 20:05:38 -0800 Subject: [PATCH 4/8] dynamicsWorldMt: make island mgr aware of whether a parallel solver is present and make handoff from parallel solver to solver-pool more explicit --- .../CommonRigidBodyMTBase.cpp | 28 ++++- .../Dynamics/btDiscreteDynamicsWorldMt.cpp | 88 +++----------- .../Dynamics/btDiscreteDynamicsWorldMt.h | 4 +- .../Dynamics/btSimulationIslandManagerMt.cpp | 110 ++++++++++-------- .../Dynamics/btSimulationIslandManagerMt.h | 36 +++--- 5 files changed, 116 insertions(+), 150 deletions(-) diff --git a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp index b11cd7691..f9e0c209a 100644 --- a/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp +++ b/examples/MultiThreadedDemo/CommonRigidBodyMTBase.cpp @@ -200,11 +200,11 @@ public: /// /// myParallelIslandDispatch -- wrap default 
parallel dispatch for profiling and to get the number of simulation islands // -void myParallelIslandDispatch( btAlignedObjectArray* islandsPtr, btSimulationIslandManagerMt::IslandCallback* callback ) +void myParallelIslandDispatch( btAlignedObjectArray* islandsPtr, const btSimulationIslandManagerMt::SolverParams& solverParams) { ProfileHelper prof( Profiler::kRecordDispatchIslands ); gNumIslands = islandsPtr->size(); - btSimulationIslandManagerMt::parallelIslandDispatch( islandsPtr, callback ); + btSimulationIslandManagerMt::parallelIslandDispatch( islandsPtr, solverParams ); } @@ -239,9 +239,10 @@ public: MyDiscreteDynamicsWorld( btDispatcher* dispatcher, btBroadphaseInterface* pairCache, btConstraintSolverPoolMt* constraintSolver, + btSequentialImpulseConstraintSolverMt* constraintSolverMt, btCollisionConfiguration* collisionConfiguration ) : - btDiscreteDynamicsWorldMt( dispatcher, pairCache, constraintSolver, collisionConfiguration ) + btDiscreteDynamicsWorldMt( dispatcher, pairCache, constraintSolver, constraintSolverMt, collisionConfiguration ) { btSimulationIslandManagerMt* islandMgr = static_cast( m_islandManager ); islandMgr->setIslandDispatchFunction( myParallelIslandDispatch ); @@ -347,11 +348,12 @@ static btTaskSchedulerManager gTaskSchedulerMgr; #if BT_THREADSAFE static bool gMultithreadedWorld = true; static bool gDisplayProfileInfo = true; +static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT; #else static bool gMultithreadedWorld = false; static bool gDisplayProfileInfo = false; +static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE; #endif -static SolverType gSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT; static int gSolverMode = SOLVER_SIMD | SOLVER_USE_WARMSTARTING | // SOLVER_RANDMIZE_ORDER | @@ -547,16 +549,28 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld() btConstraintSolverPoolMt* solverPool; { + SolverType poolSolverType = m_solverType; + if (poolSolverType == SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT) + { + // pool 
solvers shouldn't be parallel solvers, we don't allow that kind of + // nested parallelism because of performance issues + poolSolverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE; + } btConstraintSolver* solvers[ BT_MAX_THREAD_COUNT ]; int maxThreadCount = BT_MAX_THREAD_COUNT; for ( int i = 0; i < maxThreadCount; ++i ) { - solvers[ i ] = createSolverByType( m_solverType ); + solvers[ i ] = createSolverByType( poolSolverType ); } solverPool = new btConstraintSolverPoolMt( solvers, maxThreadCount ); m_solver = solverPool; } - btDiscreteDynamicsWorld* world = new MyDiscreteDynamicsWorld( m_dispatcher, m_broadphase, solverPool, m_collisionConfiguration ); + btSequentialImpulseConstraintSolverMt* solverMt = NULL; + if ( m_solverType == SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT ) + { + solverMt = new MySequentialImpulseConstraintSolverMt(); + } + btDiscreteDynamicsWorld* world = new MyDiscreteDynamicsWorld( m_dispatcher, m_broadphase, solverPool, solverMt, m_collisionConfiguration ); m_dynamicsWorld = world; m_multithreadedWorld = true; btAssert( btGetTaskScheduler() != NULL ); @@ -579,6 +593,8 @@ void CommonRigidBodyMTBase::createEmptyDynamicsWorld() SolverType solverType = m_solverType; if ( solverType == SOLVER_TYPE_SEQUENTIAL_IMPULSE_MT ) { + // using the parallel solver with the single-threaded world works, but is + // disabled here to avoid confusion solverType = SOLVER_TYPE_SEQUENTIAL_IMPULSE; } m_solver = createSolverByType( solverType ); diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp index 330bccb87..d705bf238 100644 --- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp +++ b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.cpp @@ -50,63 +50,6 @@ subject to the following restrictions: #include "LinearMath/btSerializer.h" -struct InplaceSolverIslandCallbackMt : public btSimulationIslandManagerMt::IslandCallback -{ - btContactSolverInfo* m_solverInfo; - btConstraintSolver* m_solver; 
- btIDebugDraw* m_debugDrawer; - btDispatcher* m_dispatcher; - - InplaceSolverIslandCallbackMt( - btConstraintSolver* solver, - btStackAlloc* stackAlloc, - btDispatcher* dispatcher) - :m_solverInfo(NULL), - m_solver(solver), - m_debugDrawer(NULL), - m_dispatcher(dispatcher) - { - - } - - InplaceSolverIslandCallbackMt& operator=(InplaceSolverIslandCallbackMt& other) - { - btAssert(0); - (void)other; - return *this; - } - - SIMD_FORCE_INLINE void setup ( btContactSolverInfo* solverInfo, btIDebugDraw* debugDrawer) - { - btAssert(solverInfo); - m_solverInfo = solverInfo; - m_debugDrawer = debugDrawer; - } - - - virtual void processIsland( btCollisionObject** bodies, - int numBodies, - btPersistentManifold** manifolds, - int numManifolds, - btTypedConstraint** constraints, - int numConstraints, - int islandId - ) - { - m_solver->solveGroup( bodies, - numBodies, - manifolds, - numManifolds, - constraints, - numConstraints, - *m_solverInfo, - m_debugDrawer, - m_dispatcher - ); - } - -}; - /// /// btConstraintSolverPoolMt @@ -209,7 +152,12 @@ void btConstraintSolverPoolMt::reset() /// btDiscreteDynamicsWorldMt /// -btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher, btBroadphaseInterface* pairCache, btConstraintSolverPoolMt* constraintSolver, btCollisionConfiguration* collisionConfiguration) +btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher, + btBroadphaseInterface* pairCache, + btConstraintSolverPoolMt* constraintSolver, + btConstraintSolver* constraintSolverMt, + btCollisionConfiguration* collisionConfiguration +) : btDiscreteDynamicsWorld(dispatcher,pairCache,constraintSolver,collisionConfiguration) { if (m_ownsIslandManager) @@ -217,31 +165,18 @@ btDiscreteDynamicsWorldMt::btDiscreteDynamicsWorldMt(btDispatcher* dispatcher, b m_islandManager->~btSimulationIslandManager(); btAlignedFree( m_islandManager); } - { - void* mem = btAlignedAlloc(sizeof(InplaceSolverIslandCallbackMt),16); - m_solverIslandCallbackMt = new 
(mem) InplaceSolverIslandCallbackMt (m_constraintSolver, 0, dispatcher); - } { void* mem = btAlignedAlloc(sizeof(btSimulationIslandManagerMt),16); btSimulationIslandManagerMt* im = new (mem) btSimulationIslandManagerMt(); im->setMinimumSolverBatchSize( m_solverInfo.m_minimumSolverBatchSize ); m_islandManager = im; } + m_constraintSolverMt = constraintSolverMt; } btDiscreteDynamicsWorldMt::~btDiscreteDynamicsWorldMt() { - if (m_solverIslandCallbackMt) - { - m_solverIslandCallbackMt->~InplaceSolverIslandCallbackMt(); - btAlignedFree(m_solverIslandCallbackMt); - } - if (m_ownsConstraintSolver) - { - m_constraintSolver->~btConstraintSolver(); - btAlignedFree(m_constraintSolver); - } } @@ -249,12 +184,17 @@ void btDiscreteDynamicsWorldMt::solveConstraints(btContactSolverInfo& solverInfo { BT_PROFILE("solveConstraints"); - m_solverIslandCallbackMt->setup(&solverInfo, getDebugDrawer()); m_constraintSolver->prepareSolve(getCollisionWorld()->getNumCollisionObjects(), getCollisionWorld()->getDispatcher()->getNumManifolds()); /// solve all the constraints for this island btSimulationIslandManagerMt* im = static_cast(m_islandManager); - im->buildAndProcessIslands( getCollisionWorld()->getDispatcher(), getCollisionWorld(), m_constraints, m_solverIslandCallbackMt ); + btSimulationIslandManagerMt::SolverParams solverParams; + solverParams.m_solverPool = m_constraintSolver; + solverParams.m_solverMt = m_constraintSolverMt; + solverParams.m_solverInfo = &solverInfo; + solverParams.m_debugDrawer = m_debugDrawer; + solverParams.m_dispatcher = getCollisionWorld()->getDispatcher(); + im->buildAndProcessIslands( getCollisionWorld()->getDispatcher(), getCollisionWorld(), m_constraints, solverParams ); m_constraintSolver->allSolved(solverInfo, m_debugDrawer); } diff --git a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h index a8cc22dd0..667fe5800 100644 --- a/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h +++ 
b/src/BulletDynamics/Dynamics/btDiscreteDynamicsWorldMt.h @@ -21,7 +21,6 @@ subject to the following restrictions: #include "btSimulationIslandManagerMt.h" #include "BulletDynamics/ConstraintSolver/btConstraintSolver.h" -struct InplaceSolverIslandCallbackMt; /// /// btConstraintSolverPoolMt - masquerades as a constraint solver, but really it is a threadsafe pool of them. @@ -88,7 +87,7 @@ private: ATTRIBUTE_ALIGNED16(class) btDiscreteDynamicsWorldMt : public btDiscreteDynamicsWorld { protected: - InplaceSolverIslandCallbackMt* m_solverIslandCallbackMt; + btConstraintSolver* m_constraintSolverMt; virtual void solveConstraints(btContactSolverInfo& solverInfo) BT_OVERRIDE; @@ -126,6 +125,7 @@ public: btDiscreteDynamicsWorldMt(btDispatcher* dispatcher, btBroadphaseInterface* pairCache, btConstraintSolverPoolMt* constraintSolver, // Note this should be a solver-pool for multi-threading + btConstraintSolver* constraintSolverMt, // single multi-threaded solver for large islands (or NULL) btCollisionConfiguration* collisionConfiguration ); virtual ~btDiscreteDynamicsWorldMt(); diff --git a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp index 54ac39aaf..fc54f0ba6 100644 --- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp +++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.cpp @@ -276,7 +276,7 @@ btSimulationIslandManagerMt::Island* btSimulationIslandManagerMt::allocateIsland void btSimulationIslandManagerMt::buildIslands( btDispatcher* dispatcher, btCollisionWorld* collisionWorld ) { - BT_PROFILE("islandUnionFindAndQuickSort"); + BT_PROFILE("buildIslands"); btCollisionObjectArray& collisionObjects = collisionWorld->getCollisionObjectArray(); @@ -545,53 +545,58 @@ void btSimulationIslandManagerMt::mergeIslands() } -void btSimulationIslandManagerMt::serialIslandDispatch( btAlignedObjectArray* islandsPtr, IslandCallback* callback ) +void 
btSimulationIslandManagerMt::solveIsland(btConstraintSolver* solver, Island& island, const SolverParams& solverParams) +{ + btPersistentManifold** manifolds = island.manifoldArray.size() ? &island.manifoldArray[ 0 ] : NULL; + btTypedConstraint** constraintsPtr = island.constraintArray.size() ? &island.constraintArray[ 0 ] : NULL; + solver->solveGroup( &island.bodyArray[ 0 ], + island.bodyArray.size(), + manifolds, + island.manifoldArray.size(), + constraintsPtr, + island.constraintArray.size(), + *solverParams.m_solverInfo, + solverParams.m_debugDrawer, + solverParams.m_dispatcher + ); +} + + +void btSimulationIslandManagerMt::serialIslandDispatch( btAlignedObjectArray* islandsPtr, const SolverParams& solverParams ) { BT_PROFILE( "serialIslandDispatch" ); // serial dispatch btAlignedObjectArray& islands = *islandsPtr; + btConstraintSolver* solver = solverParams.m_solverMt ? solverParams.m_solverMt : solverParams.m_solverPool; for ( int i = 0; i < islands.size(); ++i ) { - Island* island = islands[ i ]; - btPersistentManifold** manifolds = island->manifoldArray.size() ? &island->manifoldArray[ 0 ] : NULL; - btTypedConstraint** constraintsPtr = island->constraintArray.size() ? 
&island->constraintArray[ 0 ] : NULL; - callback->processIsland( &island->bodyArray[ 0 ], - island->bodyArray.size(), - manifolds, - island->manifoldArray.size(), - constraintsPtr, - island->constraintArray.size(), - island->id - ); + solveIsland(solver, *islands[ i ], solverParams); } } + struct UpdateIslandDispatcher : public btIParallelForBody { - btAlignedObjectArray* islandsPtr; - btSimulationIslandManagerMt::IslandCallback* callback; + btAlignedObjectArray& m_islandsPtr; + const btSimulationIslandManagerMt::SolverParams& m_solverParams; + + UpdateIslandDispatcher(btAlignedObjectArray& islandsPtr, const btSimulationIslandManagerMt::SolverParams& solverParams) + : m_islandsPtr(islandsPtr), m_solverParams(solverParams) + {} void forLoop( int iBegin, int iEnd ) const BT_OVERRIDE { + btConstraintSolver* solver = m_solverParams.m_solverPool; for ( int i = iBegin; i < iEnd; ++i ) { - btSimulationIslandManagerMt::Island* island = ( *islandsPtr )[ i ]; - btPersistentManifold** manifolds = island->manifoldArray.size() ? &island->manifoldArray[ 0 ] : NULL; - btTypedConstraint** constraintsPtr = island->constraintArray.size() ? 
&island->constraintArray[ 0 ] : NULL; - callback->processIsland( &island->bodyArray[ 0 ], - island->bodyArray.size(), - manifolds, - island->manifoldArray.size(), - constraintsPtr, - island->constraintArray.size(), - island->id - ); + btSimulationIslandManagerMt::Island* island = m_islandsPtr[ i ]; + btSimulationIslandManagerMt::solveIsland( solver, *island, m_solverParams ); } } }; -void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray* islandsPtr, IslandCallback* callback ) +void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray* islandsPtr, const SolverParams& solverParams ) { BT_PROFILE( "parallelIslandDispatch" ); // @@ -617,24 +622,25 @@ void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArraysize()) + if (solverParams.m_solverMt) { - btSimulationIslandManagerMt::Island* island = (*islandsPtr)[ iBegin ]; - if (island->manifoldArray.size() < btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching) + while ( iBegin < islandsPtr->size() ) { - // OK to submit the rest of the array in parallel - break; + btSimulationIslandManagerMt::Island* island = ( *islandsPtr )[ iBegin ]; + if ( island->manifoldArray.size() < btSequentialImpulseConstraintSolverMt::s_minimumContactManifoldsForBatching ) + { + // OK to submit the rest of the array in parallel + break; + } + // serial dispatch to parallel solver for large islands (if any) + solveIsland(solverParams.m_solverMt, *island, solverParams); + ++iBegin; } - ++iBegin; } - // serial dispatch for large islands (if any) - dispatcher.forLoop(0, iBegin); - // parallel dispatch for rest + // parallel dispatch to sequential solvers for rest btParallelFor( iBegin, islandsPtr->size(), 1, dispatcher ); } @@ -643,15 +649,14 @@ void btSimulationIslandManagerMt::parallelIslandDispatch( btAlignedObjectArray& constraints, - IslandCallback* callback + const SolverParams& solverParams ) { + BT_PROFILE("buildAndProcessIslands"); 
btCollisionObjectArray& collisionObjects = collisionWorld->getCollisionObjectArray(); buildIslands(dispatcher,collisionWorld); - BT_PROFILE("processIslands"); - if(!getSplitIslands()) { btPersistentManifold** manifolds = dispatcher->getInternalManifoldPointer(); @@ -683,14 +688,17 @@ void btSimulationIslandManagerMt::buildAndProcessIslands( btDispatcher* dispatch } } btTypedConstraint** constraintsPtr = constraints.size() ? &constraints[ 0 ] : NULL; - callback->processIsland(&collisionObjects[0], - collisionObjects.size(), - manifolds, - maxNumManifolds, - constraintsPtr, - constraints.size(), - -1 - ); + btConstraintSolver* solver = solverParams.m_solverMt ? solverParams.m_solverMt : solverParams.m_solverPool; + solver->solveGroup(&collisionObjects[0], + collisionObjects.size(), + manifolds, + maxNumManifolds, + constraintsPtr, + constraints.size(), + *solverParams.m_solverInfo, + solverParams.m_debugDrawer, + solverParams.m_dispatcher + ); } else { @@ -710,6 +718,6 @@ void btSimulationIslandManagerMt::buildAndProcessIslands( btDispatcher* dispatch mergeIslands(); } // dispatch islands to solver - m_islandDispatch( &m_activeIslands, callback ); + m_islandDispatch( &m_activeIslands, solverParams ); } } diff --git a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h index 31a2053b4..563577a6f 100644 --- a/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h +++ b/src/BulletDynamics/Dynamics/btSimulationIslandManagerMt.h @@ -19,7 +19,9 @@ subject to the following restrictions: #include "BulletCollision/CollisionDispatch/btSimulationIslandManager.h" class btTypedConstraint; - +class btConstraintSolver; +struct btContactSolverInfo; +class btIDebugDraw; /// /// SimulationIslandManagerMt -- Multithread capable version of SimulationIslandManager @@ -45,22 +47,19 @@ public: void append( const Island& other ); // add bodies, manifolds, constraints to my own }; - struct IslandCallback + struct 
SolverParams { - virtual ~IslandCallback() {}; - - virtual void processIsland( btCollisionObject** bodies, - int numBodies, - btPersistentManifold** manifolds, - int numManifolds, - btTypedConstraint** constraints, - int numConstraints, - int islandId - ) = 0; + btConstraintSolver* m_solverPool; + btConstraintSolver* m_solverMt; + btContactSolverInfo* m_solverInfo; + btIDebugDraw* m_debugDrawer; + btDispatcher* m_dispatcher; }; - typedef void( *IslandDispatchFunc ) ( btAlignedObjectArray* islands, IslandCallback* callback ); - static void serialIslandDispatch( btAlignedObjectArray* islandsPtr, IslandCallback* callback ); - static void parallelIslandDispatch( btAlignedObjectArray* islandsPtr, IslandCallback* callback ); + static void solveIsland(btConstraintSolver* solver, Island& island, const SolverParams& solverParams); + + typedef void( *IslandDispatchFunc ) ( btAlignedObjectArray* islands, const SolverParams& solverParams ); + static void serialIslandDispatch( btAlignedObjectArray* islandsPtr, const SolverParams& solverParams ); + static void parallelIslandDispatch( btAlignedObjectArray* islandsPtr, const SolverParams& solverParams ); protected: btAlignedObjectArray m_allocatedIslands; // owner of all Islands btAlignedObjectArray m_activeIslands; // islands actively in use @@ -83,7 +82,11 @@ public: btSimulationIslandManagerMt(); virtual ~btSimulationIslandManagerMt(); - virtual void buildAndProcessIslands( btDispatcher* dispatcher, btCollisionWorld* collisionWorld, btAlignedObjectArray& constraints, IslandCallback* callback ); + virtual void buildAndProcessIslands( btDispatcher* dispatcher, + btCollisionWorld* collisionWorld, + btAlignedObjectArray& constraints, + const SolverParams& solverParams + ); virtual void buildIslands(btDispatcher* dispatcher,btCollisionWorld* colWorld); @@ -106,7 +109,6 @@ public: } }; -extern int gLargeIslandManifoldCount; #endif //BT_SIMULATION_ISLAND_MANAGER_H From eec478709afa8220da9ddbb8888f4fd64cf90e8e Mon Sep 17 00:00:00 2001 
From: Lunkhound Date: Tue, 6 Mar 2018 02:28:23 -0800 Subject: [PATCH 5/8] parallel solver: small tweaks and fixes --- .../ConstraintSolver/btBatchedConstraints.cpp | 24 +++++-------- .../btSequentialImpulseConstraintSolverMt.cpp | 2 +- .../TaskScheduler/btTaskScheduler.cpp | 34 ++----------------- .../TaskScheduler/btThreadSupportWin32.cpp | 4 +-- 4 files changed, 14 insertions(+), 50 deletions(-) diff --git a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp index a27350bf9..310601659 100644 --- a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp +++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp @@ -38,11 +38,10 @@ struct btBatchedConstraintInfo struct btBatchInfo { - int phaseId; int numConstraints; int mergeIndex; - btBatchInfo(int _phaseId = -1) : numConstraints(0), mergeIndex(-1), phaseId(_phaseId) {} + btBatchInfo() : numConstraints(0), mergeIndex(kNoMerge) {} }; @@ -728,7 +727,6 @@ struct AssignConstraintsToGridBatchesParams btIntVec3* bodyGridCoords; int numBodies; btBatchedConstraintInfo* conInfos; - char* constraintPhaseIds; int* constraintBatchIds; btIntVec3 gridChunkDim; int maxNumBatchesPerPhase; @@ -807,7 +805,6 @@ static void assignConstraintsToGridBatches(const AssignConstraintsToGridBatchesP } int iBatch = iPhase * params.maxNumBatchesPerPhase + chunkCoord[ 0 ] + chunkCoord[ 1 ] * gridChunkDim[ 0 ] + chunkCoord[ 2 ] * gridChunkDim[ 0 ] * gridChunkDim[ 1 ]; btAssert(iBatch >= 0 && iBatch < params.maxNumBatchesPerPhase*params.numPhases); - params.constraintPhaseIds[ iCon ] = iPhase; params.constraintBatchIds[ iCon ] = iBatch; } } @@ -834,8 +831,7 @@ struct AssignConstraintsToGridBatchesLoop : public btIParallelForBody /* Bodies are treated as 3D points at their center of mass. We only consider dynamic bodies at this stage, -kinematic and static bodies are dealt with at a later stage. 
Also we only consider constraints that -are between 2 dynamic bodies ("dynamic" constraints) -- constraints that involve a static or kinematic body are handled later +because only dynamic bodies are mutated when a constraint is solved, thus subject to race conditions. 1. Compute a bounding box around all dynamic bodies 2. Compute the maximum extent of all dynamic constraints. Each dynamic constraint is treated as a line segment, and we need the size of @@ -845,15 +841,16 @@ are between 2 dynamic bodies ("dynamic" constraints) -- constraints that involve so that no dynamic constraint can span more than 2 cells of our grid on any axis of the grid. The cell size should be adjusted larger in order to keep the total number of cells from being excessively high -Key idea: Given that each constraint spans 1 or 2 grid cells in each dimension, we can handle all dynamic constraints by processing +Key idea: Given that each constraint spans 1 or 2 grid cells in each dimension, we can handle all constraints by processing in chunks of 2x2x2 cells with 8 different 1-cell offsets ((0,0,0),(0,0,1),(0,1,0),(0,1,1),(1,0,0)...). For each of the 8 offsets, we create a phase, and for each 2x2x2 chunk with dynamic constraints becomes a batch in that phase. - Once all of the phases have been populated, if any of the phases end up with too few batches, they could possibly be merged with other phases. +4. Once the grid is established, we can calculate for each constraint which phase and batch it belongs in. - Finally, we handle all of the remaining (non-dynamic) constraints, these can be added to whichever phase is least populated to help - even things out +5. Do a merge small batches on the batches of each phase separately, to try to even out the sizes of batches +Optionally, we can "collapse" one dimension of our 3D grid to turn it into a 2D grid, which reduces the number of phases +to 4. 
With fewer phases, there are more constraints per phase and this makes it easier to create batches of a useful size. */ // static void setupSpatialGridBatchesMt( @@ -882,7 +879,6 @@ static void setupSpatialGridBatchesMt( btBatchInfo* batches = NULL; int* batchWork = NULL; btBatchedConstraintInfo* conInfos = NULL; - char* constraintPhaseIds = NULL; int* constraintBatchIds = NULL; int* constraintRowBatchIds = NULL; { @@ -893,7 +889,6 @@ static void setupSpatialGridBatchesMt( memHelper.addChunk( (void**) &batches, sizeof( btBatchInfo )* allocNumBatches ); memHelper.addChunk( (void**) &batchWork, sizeof( int )* allocNumBatches ); memHelper.addChunk( (void**) &conInfos, sizeof( btBatchedConstraintInfo ) * numConstraints ); - memHelper.addChunk( (void**) &constraintPhaseIds, sizeof( char ) * numConstraints ); memHelper.addChunk( (void**) &constraintBatchIds, sizeof( int ) * numConstraints ); memHelper.addChunk( (void**) &constraintRowBatchIds, sizeof( int ) * numConstraintRows ); size_t scratchSize = memHelper.getSizeToAllocate(); @@ -1010,7 +1005,7 @@ static void setupSpatialGridBatchesMt( for ( int iBatch = batchBegin; iBatch < batchEnd; ++iBatch ) { btBatchInfo& batch = batches[ iBatch ]; - batch = btBatchInfo( iPhase ); + batch = btBatchInfo(); } } @@ -1020,7 +1015,6 @@ static void setupSpatialGridBatchesMt( params.bodyGridCoords = bodyGridCoords; params.numBodies = bodies.size(); params.conInfos = conInfos; - params.constraintPhaseIds = constraintPhaseIds; params.constraintBatchIds = constraintBatchIds; params.gridChunkDim = gridChunkDim; params.maxNumBatchesPerPhase = maxNumBatchesPerPhase; @@ -1030,7 +1024,7 @@ static void setupSpatialGridBatchesMt( if (inParallel) { AssignConstraintsToGridBatchesLoop loop(params); - int grainSize = 500; + int grainSize = 250; btParallelFor(0, numConstraints, grainSize, loop); } else diff --git a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp 
b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp index b09665b15..b9ad17a03 100644 --- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp +++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp @@ -940,7 +940,7 @@ void btSequentialImpulseConstraintSolverMt::solveGroupCacheFriendlySplitImpulseI { int iPhase = batchedCons.m_phaseOrder[ iiPhase ]; const btBatchedConstraints::Range& phase = batchedCons.m_phases[ iPhase ]; - int grainSize = 8; + int grainSize = batchedCons.m_phaseGrainSize[iPhase]; leastSquaresResidual += btParallelSum( phase.begin, phase.end, grainSize, loop ); } } diff --git a/src/LinearMath/TaskScheduler/btTaskScheduler.cpp b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp index e02458367..1aa7d44d4 100644 --- a/src/LinearMath/TaskScheduler/btTaskScheduler.cpp +++ b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp @@ -17,42 +17,12 @@ typedef void* ( *btThreadLocalStorageFunc )(); -/// -/// getNumHardwareThreads() -/// -/// -/// https://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine -/// -#if __cplusplus >= 201103L - -#include - -int getNumHardwareThreads() -{ - return std::thread::hardware_concurrency(); -} - -#elif defined( _WIN32 ) +#if defined( _WIN32 ) #define WIN32_LEAN_AND_MEAN #include -int getNumHardwareThreads() -{ - // caps out at 32 - SYSTEM_INFO info; - GetSystemInfo( &info ); - return info.dwNumberOfProcessors; -} - -#else - -int getNumHardwareThreads() -{ - return 0; // don't know -} - #endif @@ -581,7 +551,6 @@ public: // put the main thread to work on emptying the job queue and then wait for all workers to finish waitJobs(); - m_antiNestingLock.unlock(); // add up all the thread sums btScalar sum = btScalar(0); @@ -589,6 +558,7 @@ public: { sum += threadLocalSum[ iThread ].mSum; } + m_antiNestingLock.unlock(); return sum; } else diff --git a/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp 
b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp index f77616337..de693590e 100644 --- a/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp +++ b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp @@ -267,8 +267,8 @@ DWORD WINAPI win32threadStartFunc( LPVOID lpParam ) void btThreadSupportWin32::runTask( int threadIndex, void* userData ) { btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ]; - btAssert( taskId >= 0 ); - btAssert( int( taskId ) < m_activeThreadStatus.size() ); + btAssert( threadIndex >= 0 ); + btAssert( int( threadIndex ) < m_activeThreadStatus.size() ); threadStatus.m_commandId = 1; threadStatus.m_status = 1; From e526e48df837d282f4a59554750769b21bbc2104 Mon Sep 17 00:00:00 2001 From: Lunkhound Date: Tue, 13 Mar 2018 04:19:02 -0700 Subject: [PATCH 6/8] parallel solver: slightly overallocate to reduce how often allocation is needed --- .../ConstraintSolver/btBatchedConstraints.cpp | 6 ++++++ .../btSequentialImpulseConstraintSolverMt.cpp | 20 +++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp index 310601659..84a00dc63 100644 --- a/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp +++ b/src/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp @@ -892,6 +892,12 @@ static void setupSpatialGridBatchesMt( memHelper.addChunk( (void**) &constraintBatchIds, sizeof( int ) * numConstraints ); memHelper.addChunk( (void**) &constraintRowBatchIds, sizeof( int ) * numConstraintRows ); size_t scratchSize = memHelper.getSizeToAllocate(); + // if we need to reallocate + if (scratchMemory->capacity() < scratchSize) + { + // allocate 6.25% extra to avoid repeated reallocs + scratchMemory->reserve( scratchSize + scratchSize/16 ); + } scratchMemory->resizeNoInitialize( scratchSize ); char* memPtr = &scratchMemory->at(0); memHelper.setChunkPointers( memPtr ); diff --git 
a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp index b9ad17a03..4ccf7b247 100644 --- a/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp +++ b/src/BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolverMt.cpp @@ -568,10 +568,22 @@ void btSequentialImpulseConstraintSolverMt::allocAllContactConstraints(btPersist } } } - m_tmpSolverContactConstraintPool.resizeNoInitialize(numContacts); - m_rollingFrictionIndexTable.resizeNoInitialize(numContacts); - m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize(numContacts*m_numFrictionDirections); - m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize(numRollingFrictionConstraints); + { + BT_PROFILE( "allocPools" ); + if ( m_tmpSolverContactConstraintPool.capacity() < numContacts ) + { + // if we need to reallocate, reserve some extra so we don't have to reallocate again next frame + int extraReserve = numContacts / 16; + m_tmpSolverContactConstraintPool.reserve( numContacts + extraReserve ); + m_rollingFrictionIndexTable.reserve( numContacts + extraReserve ); + m_tmpSolverContactFrictionConstraintPool.reserve( ( numContacts + extraReserve )*m_numFrictionDirections ); + m_tmpSolverContactRollingFrictionConstraintPool.reserve( numRollingFrictionConstraints + extraReserve ); + } + m_tmpSolverContactConstraintPool.resizeNoInitialize( numContacts ); + m_rollingFrictionIndexTable.resizeNoInitialize( numContacts ); + m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize( numContacts*m_numFrictionDirections ); + m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize( numRollingFrictionConstraints ); + } } { AllocContactConstraintsLoop loop(this, &cachedInfoArray[0]); From 04e0d57dc1272390eee711f354e3a6879855d7b2 Mon Sep 17 00:00:00 2001 From: Lunkhound Date: Fri, 16 Mar 2018 23:42:43 -0700 Subject: [PATCH 7/8] add premake option 
'enable-multithreading' --- build3/premake4.lua | 8 ++++++++ src/LinearMath/premake4.lua | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/build3/premake4.lua b/build3/premake4.lua index a39e926e5..d49b9d440 100644 --- a/build3/premake4.lua +++ b/build3/premake4.lua @@ -182,6 +182,14 @@ end trigger = "audio", description = "Enable audio" } + newoption + { + trigger = "enable_multithreading", + description = "enable CPU multithreading for bullet2 libs" + } + if _OPTIONS["enable_multithreading"] then + defines {"BT_THREADSAFE=1"} + end if _OPTIONS["double"] then defines {"BT_USE_DOUBLE_PRECISION"} end diff --git a/src/LinearMath/premake4.lua b/src/LinearMath/premake4.lua index 5f0fda6bf..3765811a9 100644 --- a/src/LinearMath/premake4.lua +++ b/src/LinearMath/premake4.lua @@ -9,5 +9,7 @@ } files { "*.cpp", - "*.h" + "*.h", + "TaskScheduler/*.cpp", + "TaskScheduler/*.h" } From bdc3c2bafb2d51f954da3b203233bc27b9224959 Mon Sep 17 00:00:00 2001 From: Lunkhound Date: Fri, 16 Mar 2018 16:38:11 -0700 Subject: [PATCH 8/8] task scheduler: add multiple job queues to improve performance when there are many threads --- .../TaskScheduler/btTaskScheduler.cpp | 557 ++++++++++++------ .../TaskScheduler/btThreadSupportInterface.h | 9 +- .../TaskScheduler/btThreadSupportPosix.cpp | 11 +- .../TaskScheduler/btThreadSupportWin32.cpp | 14 +- 4 files changed, 386 insertions(+), 205 deletions(-) diff --git a/src/LinearMath/TaskScheduler/btTaskScheduler.cpp b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp index 1aa7d44d4..02fe07ab1 100644 --- a/src/LinearMath/TaskScheduler/btTaskScheduler.cpp +++ b/src/LinearMath/TaskScheduler/btTaskScheduler.cpp @@ -7,16 +7,11 @@ #include -typedef void( *btThreadFunc )( void* userPtr, void* lsMemory ); -typedef void* ( *btThreadLocalStorageFunc )(); #if BT_THREADSAFE #include "btThreadSupportInterface.h" - - - #if defined( _WIN32 ) #define WIN32_LEAN_AND_MEAN @@ -26,6 +21,9 @@ typedef void* ( *btThreadLocalStorageFunc )(); #endif 
+typedef unsigned long long btU64; +static const int kCacheLineSize = 64; + void btSpinPause() { #if defined( _WIN32 ) @@ -46,6 +44,62 @@ struct WorkerThreadStatus }; +ATTRIBUTE_ALIGNED64(class) WorkerThreadDirectives +{ + static const int kMaxThreadCount = BT_MAX_THREAD_COUNT; + // directives for all worker threads packed into a single cacheline + char m_threadDirs[kMaxThreadCount]; + +public: + enum Type + { + kInvalid, + kGoToSleep, // go to sleep + kStayAwakeButIdle, // wait for not checking job queue + kScanForJobs, // actively scan job queue for jobs + }; + WorkerThreadDirectives() + { + for ( int i = 0; i < kMaxThreadCount; ++i ) + { + m_threadDirs[ i ] = 0; + } + } + + Type getDirective(int threadId) + { + btAssert(threadId < kMaxThreadCount); + return static_cast(m_threadDirs[threadId]); + } + + void setDirectiveByRange(int threadBegin, int threadEnd, Type dir) + { + btAssert( threadBegin < threadEnd ); + btAssert( threadEnd <= kMaxThreadCount ); + char dirChar = static_cast(dir); + for ( int i = threadBegin; i < threadEnd; ++i ) + { + m_threadDirs[ i ] = dirChar; + } + } +}; + +class JobQueue; + +ATTRIBUTE_ALIGNED64(struct) ThreadLocalStorage +{ + int m_threadId; + WorkerThreadStatus::Type m_status; + int m_numJobsFinished; + btSpinMutex m_mutex; + btScalar m_sumResult; + WorkerThreadDirectives * m_directive; + JobQueue* m_queue; + btClock* m_clock; + unsigned int m_cooldownTime; +}; + + struct IJob { virtual void executeJob(int threadId) = 0; @@ -53,88 +107,152 @@ struct IJob class ParallelForJob : public IJob { - const btIParallelForBody* mBody; - int mBegin; - int mEnd; + const btIParallelForBody* m_body; + int m_begin; + int m_end; public: ParallelForJob( int iBegin, int iEnd, const btIParallelForBody& body ) { - mBody = &body; - mBegin = iBegin; - mEnd = iEnd; + m_body = &body; + m_begin = iBegin; + m_end = iEnd; } virtual void executeJob(int threadId) BT_OVERRIDE { BT_PROFILE( "executeJob" ); // call the functor body to do the work - mBody->forLoop( 
mBegin, mEnd ); + m_body->forLoop( m_begin, m_end ); } }; -static const int kCacheLineSize = 64; - -struct ThreadLocalSum -{ - btScalar mSum; - char mCachePadding[ kCacheLineSize - sizeof( btScalar ) ]; -}; class ParallelSumJob : public IJob { - const btIParallelSumBody* mBody; - ThreadLocalSum* mSumArray; - int mBegin; - int mEnd; + const btIParallelSumBody* m_body; + ThreadLocalStorage* m_threadLocalStoreArray; + int m_begin; + int m_end; public: - ParallelSumJob( int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalSum* sums ) + ParallelSumJob( int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalStorage* tls ) { - mBody = &body; - mSumArray = sums; - mBegin = iBegin; - mEnd = iEnd; + m_body = &body; + m_threadLocalStoreArray = tls; + m_begin = iBegin; + m_end = iEnd; } virtual void executeJob( int threadId ) BT_OVERRIDE { BT_PROFILE( "executeJob" ); // call the functor body to do the work - btScalar val = mBody->sumLoop( mBegin, mEnd ); + btScalar val = m_body->sumLoop( m_begin, m_end ); +#if BT_PARALLEL_SUM_DETERMINISTISM // by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision) const float TRUNC_SCALE = float(1<<19); val = floor(val*TRUNC_SCALE+0.5f)/TRUNC_SCALE; // truncate some bits - mSumArray[threadId].mSum += val; +#endif + m_threadLocalStoreArray[threadId].m_sumResult += val; } }; -struct JobContext +ATTRIBUTE_ALIGNED64(class) JobQueue { - JobContext() - { - m_queueLock = NULL; - m_headIndex = 0; - m_tailIndex = 0; - m_workersShouldCheckQueue = false; - m_workersShouldSleep = false; - m_useSpinMutex = false; - m_coolDownTime = 1000; // 1000 microseconds - } + btThreadSupportInterface* m_threadSupport; btCriticalSection* m_queueLock; btSpinMutex m_mutex; - volatile bool m_workersShouldCheckQueue; - volatile bool m_workersShouldSleep; btAlignedObjectArray m_jobQueue; + char* m_jobMem; + int m_jobMemSize; bool m_queueIsEmpty; int m_tailIndex; int m_headIndex; + int m_allocSize; bool 
m_useSpinMutex; - unsigned int m_coolDownTime; - btClock m_clock; + btAlignedObjectArray m_neighborContexts; + char m_cachePadding[kCacheLineSize]; // prevent false sharing + void freeJobMem() + { + if ( m_jobMem ) + { + // free old + btAlignedFree(m_jobMem); + m_jobMem = NULL; + } + } + void resizeJobMem(int newSize) + { + if (newSize > m_jobMemSize) + { + freeJobMem(); + m_jobMem = static_cast(btAlignedAlloc(newSize, kCacheLineSize)); + m_jobMemSize = newSize; + } + } + +public: + + JobQueue() + { + m_jobMem = NULL; + m_jobMemSize = 0; + m_threadSupport = NULL; + m_queueLock = NULL; + m_headIndex = 0; + m_tailIndex = 0; + m_useSpinMutex = false; + } + ~JobQueue() + { + freeJobMem(); + if (m_queueLock && m_threadSupport) + { + m_threadSupport->deleteCriticalSection(m_queueLock); + m_queueLock = NULL; + } + } + void init(btThreadSupportInterface* threadSup, btAlignedObjectArray* contextArray) + { + m_threadSupport = threadSup; + if (threadSup) + { + m_queueLock = m_threadSupport->createCriticalSection(); + } + setupJobStealing(contextArray, contextArray->size()); + } + void setupJobStealing(btAlignedObjectArray* contextArray, int numActiveContexts) + { + btAlignedObjectArray& contexts = *contextArray; + int selfIndex = 0; + for (int i = 0; i < contexts.size(); ++i) + { + if ( this == &contexts[ i ] ) + { + selfIndex = i; + break; + } + } + int numNeighbors = btMin(2, contexts.size() - 1); + int neighborOffsets[ ] = {-1, 1, -2, 2, -3, 3}; + int numOffsets = sizeof(neighborOffsets)/sizeof(neighborOffsets[0]); + m_neighborContexts.reserve( numNeighbors ); + m_neighborContexts.resizeNoInitialize(0); + for (int i = 0; i < numOffsets && m_neighborContexts.size() < numNeighbors; i++) + { + int neighborIndex = selfIndex + neighborOffsets[i]; + if ( neighborIndex >= 0 && neighborIndex < numActiveContexts) + { + m_neighborContexts.push_back( &contexts[ neighborIndex ] ); + } + } + } + + bool isQueueEmpty() const {return m_queueIsEmpty;} void lockQueue() { if ( m_useSpinMutex 
) @@ -157,24 +275,44 @@ struct JobContext m_queueLock->unlock(); } } - void clearQueue() + void clearQueue(int jobCount, int jobSize) { lockQueue(); m_headIndex = 0; m_tailIndex = 0; + m_allocSize = 0; m_queueIsEmpty = true; + int jobBufSize = jobSize * jobCount; + // make sure we have enough memory allocated to store jobs + if ( jobBufSize > m_jobMemSize ) + { + resizeJobMem( jobBufSize ); + } + // make sure job queue is big enough + if ( jobCount > m_jobQueue.capacity() ) + { + m_jobQueue.reserve( jobCount ); + } unlockQueue(); m_jobQueue.resizeNoInitialize( 0 ); } + void* allocJobMem(int jobSize) + { + btAssert(m_jobMemSize >= (m_allocSize + jobSize)); + void* jobMem = &m_jobMem[m_allocSize]; + m_allocSize += jobSize; + return jobMem; + } void submitJob( IJob* job ) { + btAssert( reinterpret_cast( job ) >= &m_jobMem[ 0 ] && reinterpret_cast( job ) < &m_jobMem[ 0 ] + m_allocSize ); m_jobQueue.push_back( job ); lockQueue(); m_tailIndex++; m_queueIsEmpty = false; unlockQueue(); } - IJob* consumeJob() + IJob* consumeJobFromOwnQueue() { if ( m_queueIsEmpty ) { @@ -186,6 +324,7 @@ struct JobContext if ( !m_queueIsEmpty ) { job = m_jobQueue[ m_headIndex++ ]; + btAssert( reinterpret_cast( job ) >= &m_jobMem[ 0 ] && reinterpret_cast( job ) < &m_jobMem[ 0 ] + m_allocSize ); if ( m_headIndex == m_tailIndex ) { m_queueIsEmpty = true; @@ -194,58 +333,78 @@ struct JobContext unlockQueue(); return job; } + IJob* consumeJob() + { + if (IJob* job = consumeJobFromOwnQueue()) + { + return job; + } + // own queue is empty, try to steal from neighbor + for (int i = 0; i < m_neighborContexts.size(); ++i) + { + JobQueue* otherContext = m_neighborContexts[ i ]; + if ( IJob* job = otherContext->consumeJobFromOwnQueue() ) + { + return job; + } + } + return NULL; + } }; -struct WorkerThreadLocalStorage -{ - int threadId; - WorkerThreadStatus::Type status; - int numJobsFinished; - btSpinMutex m_mutex; -}; - - -static void WorkerThreadFunc( void* userPtr, void* lsMemory ) +static void 
WorkerThreadFunc( void* userPtr ) { BT_PROFILE( "WorkerThreadFunc" ); - WorkerThreadLocalStorage* localStorage = (WorkerThreadLocalStorage*) lsMemory; - JobContext* jobContext = (JobContext*) userPtr; + ThreadLocalStorage* localStorage = (ThreadLocalStorage*) userPtr; + JobQueue* jobQueue = localStorage->m_queue; bool shouldSleep = false; + int threadId = localStorage->m_threadId; while (! shouldSleep) { // do work localStorage->m_mutex.lock(); - while ( IJob* job = jobContext->consumeJob() ) + while ( IJob* job = jobQueue->consumeJob() ) { - localStorage->status = WorkerThreadStatus::kWorking; - job->executeJob( localStorage->threadId ); - localStorage->numJobsFinished++; + localStorage->m_status = WorkerThreadStatus::kWorking; + job->executeJob( threadId ); + localStorage->m_numJobsFinished++; } - localStorage->status = WorkerThreadStatus::kWaitingForWork; + localStorage->m_status = WorkerThreadStatus::kWaitingForWork; localStorage->m_mutex.unlock(); - unsigned long long int clockStart = jobContext->m_clock.getTimeMicroseconds(); + btU64 clockStart = localStorage->m_clock->getTimeMicroseconds(); // while queue is empty, - while (jobContext->m_queueIsEmpty) + while (jobQueue->isQueueEmpty()) { // todo: spin wait a bit to avoid hammering the empty queue btSpinPause(); - if ( jobContext->m_workersShouldSleep ) + if ( localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kGoToSleep ) { shouldSleep = true; break; } // if jobs are incoming, - if (jobContext->m_workersShouldCheckQueue) + if ( localStorage->m_directive->getDirective( threadId ) == WorkerThreadDirectives::kScanForJobs ) { - clockStart = jobContext->m_clock.getTimeMicroseconds(); // reset clock + clockStart = localStorage->m_clock->getTimeMicroseconds(); // reset clock } else { + for ( int i = 0; i < 50; ++i ) + { + btSpinPause(); + btSpinPause(); + btSpinPause(); + btSpinPause(); + if (localStorage->m_directive->getDirective( threadId ) == WorkerThreadDirectives::kScanForJobs || 
!jobQueue->isQueueEmpty()) + { + break; + } + } // if no jobs incoming and queue has been empty for the cooldown time, sleep - unsigned long long int timeElapsed = jobContext->m_clock.getTimeMicroseconds() - clockStart; - if (timeElapsed > jobContext->m_coolDownTime) + btU64 timeElapsed = localStorage->m_clock->getTimeMicroseconds() - clockStart; + if (timeElapsed > localStorage->m_cooldownTime) { shouldSleep = true; break; @@ -254,77 +413,107 @@ static void WorkerThreadFunc( void* userPtr, void* lsMemory ) } } - // go idle + // go sleep localStorage->m_mutex.lock(); - localStorage->status = WorkerThreadStatus::kSleeping; + localStorage->m_status = WorkerThreadStatus::kSleeping; localStorage->m_mutex.unlock(); } -static void* WorkerThreadAllocFunc() -{ - return new WorkerThreadLocalStorage; -} - - - class btTaskSchedulerDefault : public btITaskScheduler { - JobContext m_jobContext; btThreadSupportInterface* m_threadSupport; - btAlignedObjectArray m_jobMem; - btAlignedObjectArray m_threadLocalMem; + WorkerThreadDirectives* m_workerDirective; + btAlignedObjectArray m_jobQueues; + btAlignedObjectArray m_perThreadJobQueues; + btAlignedObjectArray m_threadLocalStorage; btSpinMutex m_antiNestingLock; // prevent nested parallel-for + btClock m_clock; int m_numThreads; int m_numWorkerThreads; + int m_numActiveJobQueues; int m_maxNumThreads; int m_numJobs; + static const int kFirstWorkerThreadId = 1; public: btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport") { m_threadSupport = NULL; + m_workerDirective = NULL; } virtual ~btTaskSchedulerDefault() { - shutdown(); + waitForWorkersToSleep(); + if (m_threadSupport) + { + delete m_threadSupport; + m_threadSupport = NULL; + } + if (m_workerDirective) + { + btAlignedFree(m_workerDirective); + m_workerDirective = NULL; + } } void init() { - btThreadSupportInterface::ConstructionInfo constructionInfo( "TaskScheduler", WorkerThreadFunc, WorkerThreadAllocFunc ); + btThreadSupportInterface::ConstructionInfo constructionInfo( 
"TaskScheduler", WorkerThreadFunc ); m_threadSupport = btThreadSupportInterface::create( constructionInfo ); + m_workerDirective = static_cast(btAlignedAlloc(sizeof(*m_workerDirective), 64)); m_numWorkerThreads = m_threadSupport->getNumWorkerThreads(); m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1; m_numThreads = m_maxNumThreads; - m_jobContext.m_queueLock = m_threadSupport->createCriticalSection(); - for ( int i = 0; i < m_numWorkerThreads; i++ ) + // ideal to have one job queue for each physical processor (except for the main thread which needs no queue) + int numThreadsPerQueue = m_threadSupport->getLogicalToPhysicalCoreRatio(); + int numJobQueues = (numThreadsPerQueue == 1) ? (m_maxNumThreads-1) : (m_maxNumThreads / numThreadsPerQueue); + m_jobQueues.resize(numJobQueues); + m_numActiveJobQueues = numJobQueues; + for ( int i = 0; i < m_jobQueues.size(); ++i ) { - WorkerThreadLocalStorage* storage = (WorkerThreadLocalStorage*) m_threadSupport->getThreadLocalMemory( i ); - btAssert( storage ); - storage->threadId = i + 1; // workers start at 1 - storage->status = WorkerThreadStatus::kSleeping; + m_jobQueues[i].init( m_threadSupport, &m_jobQueues ); } - setWorkersActive( false ); // no work for them yet + m_perThreadJobQueues.resize(m_numThreads); + for ( int i = 0; i < m_numThreads; i++ ) + { + JobQueue* jq = NULL; + // only worker threads get a job queue + if (i > 0) + { + if (numThreadsPerQueue == 1) + { + // one queue per worker thread + jq = &m_jobQueues[ i - kFirstWorkerThreadId ]; + } + else + { + // 2 threads share each queue + jq = &m_jobQueues[ i / numThreadsPerQueue ]; + } + } + m_perThreadJobQueues[i] = jq; + } + m_threadLocalStorage.resize(m_numThreads); + for ( int i = 0; i < m_numThreads; i++ ) + { + ThreadLocalStorage& storage = m_threadLocalStorage[i]; + storage.m_threadId = i; + storage.m_directive = m_workerDirective; + storage.m_status = WorkerThreadStatus::kSleeping; + storage.m_cooldownTime = 1000; // 1000 microseconds, threads 
go to sleep after this long if they have nothing to do + storage.m_clock = &m_clock; + storage.m_queue = m_perThreadJobQueues[i]; + } + setWorkerDirectives( WorkerThreadDirectives::kGoToSleep ); // no work for them yet setNumThreads( m_threadSupport->getCacheFriendlyNumThreads() ); } - virtual void shutdown() + void setWorkerDirectives(WorkerThreadDirectives::Type dir) { - setWorkersActive( false ); - waitForWorkersToSleep(); - m_threadSupport->deleteCriticalSection( m_jobContext.m_queueLock ); - m_jobContext.m_queueLock = NULL; - - delete m_threadSupport; - m_threadSupport = NULL; - } - - void setWorkersActive( bool active ) - { - m_jobContext.m_workersShouldCheckQueue = active; + m_workerDirective->setDirectiveByRange(kFirstWorkerThreadId, m_numThreads, dir); } virtual int getMaxNumThreads() const BT_OVERRIDE @@ -341,38 +530,56 @@ public: { m_numThreads = btMax( btMin(numThreads, int(m_maxNumThreads)), 1 ); m_numWorkerThreads = m_numThreads - 1; + m_numActiveJobQueues = 0; + // if there is at least 1 worker, + if ( m_numWorkerThreads > 0 ) + { + // re-setup job stealing between queues to avoid attempting to steal from an inactive job queue + JobQueue* lastActiveContext = m_perThreadJobQueues[ m_numThreads - 1 ]; + int iLastActiveContext = lastActiveContext - &m_jobQueues[0]; + m_numActiveJobQueues = iLastActiveContext + 1; + for ( int i = 0; i < m_jobQueues.size(); ++i ) + { + m_jobQueues[ i ].setupJobStealing( &m_jobQueues, m_numActiveJobQueues ); + } + } + m_workerDirective->setDirectiveByRange(m_numThreads, BT_MAX_THREAD_COUNT, WorkerThreadDirectives::kGoToSleep); } void waitJobs() { BT_PROFILE( "waitJobs" ); - // have the main thread work until the job queue is empty + // have the main thread work until the job queues are empty int numMainThreadJobsFinished = 0; - while ( IJob* job = m_jobContext.consumeJob() ) + for ( int i = 0; i < m_numActiveJobQueues; ++i ) { - job->executeJob( 0 ); - numMainThreadJobsFinished++; + while ( IJob* job = 
m_jobQueues[i].consumeJob() ) + { + job->executeJob( 0 ); + numMainThreadJobsFinished++; + } } - // done with jobs for now, tell workers to rest - setWorkersActive( false ); - unsigned long long int clockStart = m_jobContext.m_clock.getTimeMicroseconds(); + // done with jobs for now, tell workers to rest (but not sleep) + setWorkerDirectives( WorkerThreadDirectives::kStayAwakeButIdle ); + + btU64 clockStart = m_clock.getTimeMicroseconds(); // wait for workers to finish any jobs in progress while ( true ) { int numWorkerJobsFinished = 0; - for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker ) + for ( int iThread = kFirstWorkerThreadId; iThread < m_numThreads; ++iThread ) { - WorkerThreadLocalStorage* storage = static_cast( m_threadSupport->getThreadLocalMemory( iWorker ) ); + ThreadLocalStorage* storage = &m_threadLocalStorage[iThread]; storage->m_mutex.lock(); - numWorkerJobsFinished += storage->numJobsFinished; + numWorkerJobsFinished += storage->m_numJobsFinished; storage->m_mutex.unlock(); } if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs) { break; } - unsigned long long int timeElapsed = m_jobContext.m_clock.getTimeMicroseconds() - clockStart; + btU64 timeElapsed = m_clock.getTimeMicroseconds() - clockStart; btAssert(timeElapsed < 1000); if (timeElapsed > 100000) { @@ -385,25 +592,25 @@ public: void wakeWorkers(int numWorkersToWake) { BT_PROFILE( "wakeWorkers" ); - btAssert( m_jobContext.m_workersShouldCheckQueue ); + btAssert( m_workerDirective->getDirective(1) == WorkerThreadDirectives::kScanForJobs ); int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads); int numActiveWorkers = 0; for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker ) { // note this count of active workers is not necessarily totally reliable, because a worker thread could be // just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare. 
- WorkerThreadLocalStorage* storage = static_cast( m_threadSupport->getThreadLocalMemory( iWorker ) ); - if (storage->status != WorkerThreadStatus::kSleeping) + ThreadLocalStorage& storage = m_threadLocalStorage[ kFirstWorkerThreadId + iWorker ]; + if (storage.m_status != WorkerThreadStatus::kSleeping) { numActiveWorkers++; } } for ( int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker ) { - WorkerThreadLocalStorage* storage = static_cast( m_threadSupport->getThreadLocalMemory( iWorker ) ); - if (storage->status == WorkerThreadStatus::kSleeping) + ThreadLocalStorage& storage = m_threadLocalStorage[ kFirstWorkerThreadId + iWorker ]; + if (storage.m_status == WorkerThreadStatus::kSleeping) { - m_threadSupport->runTask( iWorker, &m_jobContext ); + m_threadSupport->runTask( iWorker, &storage ); numActiveWorkers++; } } @@ -412,13 +619,12 @@ public: void waitForWorkersToSleep() { BT_PROFILE( "waitForWorkersToSleep" ); - m_jobContext.m_workersShouldSleep = true; + setWorkerDirectives( WorkerThreadDirectives::kGoToSleep ); m_threadSupport->waitForAllTasks(); - for ( int i = 0; i < m_numWorkerThreads; i++ ) + for ( int i = kFirstWorkerThreadId; i < m_numThreads; i++ ) { - WorkerThreadLocalStorage* storage = static_cast( m_threadSupport->getThreadLocalMemory(i) ); - btAssert( storage ); - btAssert( storage->status == WorkerThreadStatus::kSleeping ); + ThreadLocalStorage& storage = m_threadLocalStorage[i]; + btAssert( storage.m_status == WorkerThreadStatus::kSleeping ); } } @@ -426,20 +632,19 @@ public: { BT_PROFILE( "sleepWorkerThreadsHint" ); // hint the task scheduler that we may not be using these threads for a little while - m_jobContext.m_workersShouldSleep = true; + setWorkerDirectives( WorkerThreadDirectives::kGoToSleep ); } void prepareWorkerThreads() { - for ( int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker ) + for ( int i = kFirstWorkerThreadId; i < m_numThreads; ++i ) { - WorkerThreadLocalStorage* storage = 
static_cast( m_threadSupport->getThreadLocalMemory( iWorker ) ); - storage->m_mutex.lock(); - storage->numJobsFinished = 0; - storage->m_mutex.unlock(); + ThreadLocalStorage& storage = m_threadLocalStorage[i]; + storage.m_mutex.lock(); + storage.m_numJobsFinished = 0; + storage.m_mutex.unlock(); } - m_jobContext.m_workersShouldSleep = false; - setWorkersActive( true ); + setWorkerDirectives( WorkerThreadDirectives::kScanForJobs ); } virtual void parallelFor( int iBegin, int iEnd, int grainSize, const btIParallelForBody& body ) BT_OVERRIDE @@ -455,32 +660,32 @@ public: m_numJobs = jobCount; btAssert( jobCount >= 2 ); // need more than one job for multithreading int jobSize = sizeof( JobType ); - int jobBufSize = jobSize * jobCount; - // make sure we have enough memory allocated to store jobs - if ( jobBufSize > m_jobMem.size() ) - { - m_jobMem.resize( jobBufSize ); - } - // make sure job queue is big enough - if ( jobCount > m_jobContext.m_jobQueue.capacity() ) - { - m_jobContext.m_jobQueue.reserve( jobCount ); - } - m_jobContext.clearQueue(); + for (int i = 0; i < m_numActiveJobQueues; ++i) + { + m_jobQueues[i].clearQueue( jobCount, jobSize ); + } // prepare worker threads for incoming work prepareWorkerThreads(); // submit all of the jobs int iJob = 0; - JobType* jobs = reinterpret_cast( &m_jobMem[ 0 ] ); + int iThread = kFirstWorkerThreadId; // first worker thread for ( int i = iBegin; i < iEnd; i += grainSize ) { btAssert( iJob < jobCount ); int iE = btMin( i + grainSize, iEnd ); - JobType& job = jobs[ iJob ]; - new ( (void*) &job ) ParallelForJob( i, iE, body ); // placement new - m_jobContext.submitJob( &job ); + JobQueue* jq = m_perThreadJobQueues[ iThread ]; + btAssert(jq); + btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues); + void* jobMem = jq->allocJobMem(jobSize); + JobType* job = new ( jobMem ) ParallelForJob( i, iE, body ); // placement new + jq->submitJob( job ); iJob++; + iThread++; + if ( iThread >= m_numThreads ) + { + iThread = 
kFirstWorkerThreadId; // first worker thread + } } wakeWorkers( jobCount - 1 ); @@ -508,44 +713,38 @@ public: m_numJobs = jobCount; btAssert( jobCount >= 2 ); // need more than one job for multithreading int jobSize = sizeof( JobType ); - int jobBufSize = jobSize * jobCount; - // make sure we have enough memory allocated to store jobs - if ( jobBufSize > m_jobMem.size() ) + for (int i = 0; i < m_numActiveJobQueues; ++i) { - m_jobMem.resize( jobBufSize ); - } - // make sure job queue is big enough - if ( jobCount > m_jobContext.m_jobQueue.capacity() ) - { - m_jobContext.m_jobQueue.reserve( jobCount ); - } - // make sure thread local area is big enough - int threadLocalSize = m_numThreads * sizeof( ThreadLocalSum ); - if ( threadLocalSize > m_threadLocalMem.size() ) - { - m_threadLocalMem.resize( threadLocalSize ); - } - // initialize summation - ThreadLocalSum* threadLocalSum = reinterpret_cast( &m_threadLocalMem[ 0 ] ); - for ( int iThread = 0; iThread < m_numThreads; ++iThread ) - { - threadLocalSum[ iThread ].mSum = btScalar( 0 ); + m_jobQueues[i].clearQueue( jobCount, jobSize ); + } + + // initialize summation + for ( int iThread = 0; iThread < m_numThreads; ++iThread ) + { + m_threadLocalStorage[iThread].m_sumResult = btScalar(0); } - m_jobContext.clearQueue(); // prepare worker threads for incoming work prepareWorkerThreads(); // submit all of the jobs int iJob = 0; - JobType* jobs = reinterpret_cast( &m_jobMem[ 0 ] ); + int iThread = kFirstWorkerThreadId; // first worker thread for ( int i = iBegin; i < iEnd; i += grainSize ) { btAssert( iJob < jobCount ); int iE = btMin( i + grainSize, iEnd ); - JobType& job = jobs[ iJob ]; - new ( (void*) &job ) ParallelSumJob( i, iE, body, threadLocalSum ); // placement new - m_jobContext.submitJob( &job ); + JobQueue* jq = m_perThreadJobQueues[ iThread ]; + btAssert(jq); + btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues); + void* jobMem = jq->allocJobMem(jobSize); + JobType* job = new ( jobMem ) ParallelSumJob( i, 
iE, body, &m_threadLocalStorage[0] ); // placement new + jq->submitJob( job ); iJob++; + iThread++; + if ( iThread >= m_numThreads ) + { + iThread = kFirstWorkerThreadId; // first worker thread + } } wakeWorkers( jobCount - 1 ); @@ -556,7 +755,7 @@ public: btScalar sum = btScalar(0); for ( int iThread = 0; iThread < m_numThreads; ++iThread ) { - sum += threadLocalSum[ iThread ].mSum; + sum += m_threadLocalStorage[ iThread ].m_sumResult; } m_antiNestingLock.unlock(); return sum; @@ -586,4 +785,4 @@ btITaskScheduler* btCreateDefaultTaskScheduler() return NULL; } -#endif // #else // #if BT_THREADSAFE \ No newline at end of file +#endif // #else // #if BT_THREADSAFE diff --git a/src/LinearMath/TaskScheduler/btThreadSupportInterface.h b/src/LinearMath/TaskScheduler/btThreadSupportInterface.h index d537d7095..a0ad802b1 100644 --- a/src/LinearMath/TaskScheduler/btThreadSupportInterface.h +++ b/src/LinearMath/TaskScheduler/btThreadSupportInterface.h @@ -37,34 +37,29 @@ public: virtual int getNumWorkerThreads() const = 0; // number of worker threads (total number of logical processors - 1) virtual int getCacheFriendlyNumThreads() const = 0; // the number of logical processors sharing a single L3 cache + virtual int getLogicalToPhysicalCoreRatio() const = 0; // the number of logical processors per physical processor (usually 1 or 2) virtual void runTask( int threadIndex, void* userData ) = 0; virtual void waitForAllTasks() = 0; virtual btCriticalSection* createCriticalSection() = 0; virtual void deleteCriticalSection( btCriticalSection* criticalSection ) = 0; - virtual void* getThreadLocalMemory( int taskId ) { return NULL; } - - typedef void( *ThreadFunc )( void* userPtr, void* lsMemory ); - typedef void* ( *MemorySetupFunc )( ); + typedef void( *ThreadFunc )( void* userPtr ); struct ConstructionInfo { ConstructionInfo( const char* uniqueName, ThreadFunc userThreadFunc, - MemorySetupFunc lsMemoryFunc, int threadStackSize = 65535 ) :m_uniqueName( uniqueName ), 
m_userThreadFunc( userThreadFunc ), - m_lsMemoryFunc( lsMemoryFunc ), m_threadStackSize( threadStackSize ) { } const char* m_uniqueName; ThreadFunc m_userThreadFunc; - MemorySetupFunc m_lsMemoryFunc; int m_threadStackSize; }; diff --git a/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp b/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp index 5521fc555..ccd7d1e12 100644 --- a/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp +++ b/src/LinearMath/TaskScheduler/btThreadSupportPosix.cpp @@ -73,7 +73,6 @@ public: ThreadFunc m_userThreadFunc; void* m_userPtr; //for taskDesc etc - void* m_lsMemory; //initialized using PosixLocalStoreMemorySetupFunc pthread_t thread; //each tread will wait until this signal to start its work @@ -103,17 +102,14 @@ public: virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; } // TODO: return the number of logical processors sharing the first L3 cache virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return m_numThreads + 1; } + // TODO: detect if CPU has hyperthreading enabled + virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return 1; } virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE; virtual void waitForAllTasks() BT_OVERRIDE; virtual btCriticalSection* createCriticalSection() BT_OVERRIDE; virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE; - - virtual void* getThreadLocalMemory( int taskId ) BT_OVERRIDE - { - return m_activeThreadStatus[ taskId ].m_lsMemory; - } }; @@ -190,7 +186,7 @@ static void *threadFunction( void *argument ) if ( userPtr ) { btAssert( status->m_status ); - status->m_userThreadFunc( userPtr, status->m_lsMemory ); + status->m_userThreadFunc( userPtr ); status->m_status = 2; checkPThreadFunction( sem_post( status->m_mainSemaphore ) ); status->threadUsed++; @@ -292,7 +288,6 @@ void btThreadSupportPosix::startThreads( const ConstructionInfo& threadConstruct threadStatus.m_commandId = 0; 
threadStatus.m_status = 0; threadStatus.m_mainSemaphore = m_mainSemaphore; - threadStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc(); threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc; threadStatus.threadUsed = 0; diff --git a/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp index de693590e..00edac650 100644 --- a/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp +++ b/src/LinearMath/TaskScheduler/btThreadSupportWin32.cpp @@ -179,7 +179,6 @@ public: ThreadFunc m_userThreadFunc; void* m_userPtr; //for taskDesc etc - void* m_lsMemory; //initialized using Win32LocalStoreMemorySetupFunc void* m_threadHandle; //this one is calling 'Win32ThreadFunc' @@ -208,15 +207,11 @@ public: virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; } virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); } + virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return m_processorInfo.numLogicalProcessors / m_processorInfo.numCores; } virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE; virtual void waitForAllTasks() BT_OVERRIDE; - virtual void* getThreadLocalMemory( int taskId ) BT_OVERRIDE - { - return m_activeThreadStatus[ taskId ].m_lsMemory; - } - virtual btCriticalSection* createCriticalSection() BT_OVERRIDE; virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE; }; @@ -246,7 +241,7 @@ DWORD WINAPI win32threadStartFunc( LPVOID lpParam ) if ( userPtr ) { btAssert( status->m_status ); - status->m_userThreadFunc( userPtr, status->m_lsMemory ); + status->m_userThreadFunc( userPtr ); status->m_status = 2; SetEvent( status->m_eventCompleteHandle ); } @@ -392,7 +387,6 @@ void btThreadSupportWin32::startThreads( const ConstructionInfo& threadConstruct threadStatus.m_commandId = 0; threadStatus.m_status = 0; threadStatus.m_threadHandle = handle; - 
threadStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc(); threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc; printf( "started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle ); @@ -410,9 +404,7 @@ void btThreadSupportWin32::stopThreads() WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE ); } - delete threadStatus.m_lsMemory; - - threadStatus.m_userPtr = 0; + threadStatus.m_userPtr = NULL; SetEvent( threadStatus.m_eventStartHandle ); WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE );