From fbc17731ec66da9626b52c7fe8830a03faf4049e Mon Sep 17 00:00:00 2001 From: "erwin.coumans" Date: Thu, 8 Jul 2010 17:02:38 +0000 Subject: [PATCH] Several changes to sync Bullet trunk with PlayStation 3 spubullet version Still needs some cross-platform fixes --- CMakeLists.txt | 13 +- Demos/AllBulletDemos/CMakeLists.txt | 3 +- Demos/BasicDemo/BasicDemo.cpp | 50 +- Demos/BasicDemo/CMakeLists.txt | 3 +- Demos/Benchmarks/BenchmarkDemo.cpp | 3 + Demos/Benchmarks/CMakeLists.txt | 6 +- Demos/Box2dDemo/CMakeLists.txt | 142 +- Demos/CMakeLists.txt | 5 - .../ConcaveRaycastDemo/ConcaveRaycastDemo.cpp | 3 +- Demos/MultiThreadedDemo/CMakeLists.txt | 80 +- Demos/MultiThreadedDemo/MultiThreadedDemo.cpp | 76 +- Demos/OpenGL/DemoApplication.cpp | 11 +- Demos/ParticlesOpenCL/NVidia/CMakeLists.txt | 14 +- Demos/ParticlesOpenCL/ParticlesDemo.cpp | 69 +- Demos/ParticlesOpenCL/ParticlesOCL.cl | 11 +- .../btParticlesDemoDynamicsWorld.cpp | 131 +- .../btParticlesDynamicsWorld.h | 1 + Demos/ParticlesOpenCL/shaders.cpp | 35 +- Demos/SharedOpenCL/btOclUtils.cpp | 1 + Demos/ThreadingDemo/main.cpp | 4 +- Demos/VectorAdd_OpenCL/MiniCL_VectorAdd.cpp | 2 +- Demos/VectorAdd_OpenCL/NVidia/CMakeLists.txt | 12 +- .../NarrowPhaseCollision/btManifoldPoint.h | 19 +- .../btPersistentManifold.h | 29 +- .../ConstraintSolver/btSolverConstraint.h | 4 +- src/BulletDynamics/Dynamics/btRigidBody.cpp | 4 +- src/BulletMultiThreaded/CMakeLists.txt | 7 +- src/BulletMultiThreaded/HeapManager.h | 116 + src/BulletMultiThreaded/MiniCL.cpp | 13 +- src/BulletMultiThreaded/PlatformDefinitions.h | 28 +- src/BulletMultiThreaded/PpuAddressSpace.h | 5 +- .../SequentialThreadSupport.cpp | 76 + .../SequentialThreadSupport.h | 6 +- src/BulletMultiThreaded/SpuFakeDma.cpp | 4 + .../SpuNarrowPhaseCollisionTask/Box.h | 32 +- .../SpuContactResult.cpp | 2 +- .../SpuGatheringCollisionTask.cpp | 60 +- .../boxBoxDistance.cpp | 345 +-- .../boxBoxDistance.h | 11 +- src/BulletMultiThreaded/TrbDynBody.h | 78 + src/BulletMultiThreaded/TrbStateVec.h | 334 +++ .../btParallelConstraintSolver.cpp | 1341 ++++++++- .../btParallelConstraintSolver.h | 230 +- .../vectormath/scalar/{cpp => }/boolInVec.h | 0 .../vectormath/scalar/{cpp => }/floatInVec.h | 0 .../vectormath/scalar/{cpp => }/mat_aos.h | 0 .../vectormath/scalar/{cpp => }/quat_aos.h | 0 .../vectormath/scalar/{cpp => }/vec_aos.h | 0 .../scalar/{cpp => }/vectormath_aos.h | 0 .../vectormath/sse/boolInVec.h | 247 ++ .../vectormath/sse/floatInVec.h | 340 +++ .../vectormath/sse/mat_aos.h | 2190 ++++++++++++++ .../vectormath/sse/quat_aos.h | 579 ++++ .../vectormath/sse/vec_aos.h | 1431 ++++++++++ .../vectormath/sse/vecidx_aos.h | 80 + .../vectormath/sse/vectormath_aos.h | 2532 +++++++++++++++++ src/BulletMultiThreaded/vectormath2bullet.h | 7 +- src/CMakeLists.txt | 4 - src/LinearMath/btMinMax.h | 6 +- src/LinearMath/btPoolAllocator.h | 14 + src/LinearMath/btScalar.h | 11 +- src/LinearMath/btVector3.h | 23 +- src/MiniCL/cl_MiniCL_Defs.h | 2 +- 63 files changed, 10363 insertions(+), 522 deletions(-) create mode 100644 src/BulletMultiThreaded/HeapManager.h create mode 100644 src/BulletMultiThreaded/TrbDynBody.h create mode 100644 src/BulletMultiThreaded/TrbStateVec.h rename src/BulletMultiThreaded/vectormath/scalar/{cpp => }/boolInVec.h (100%) rename src/BulletMultiThreaded/vectormath/scalar/{cpp => }/floatInVec.h (100%) rename src/BulletMultiThreaded/vectormath/scalar/{cpp => }/mat_aos.h (100%) rename src/BulletMultiThreaded/vectormath/scalar/{cpp => }/quat_aos.h (100%) rename src/BulletMultiThreaded/vectormath/scalar/{cpp => }/vec_aos.h (100%) rename src/BulletMultiThreaded/vectormath/scalar/{cpp => }/vectormath_aos.h (100%) create mode 100644 src/BulletMultiThreaded/vectormath/sse/boolInVec.h create mode 100644 src/BulletMultiThreaded/vectormath/sse/floatInVec.h create mode 100644 src/BulletMultiThreaded/vectormath/sse/mat_aos.h create mode 100644 src/BulletMultiThreaded/vectormath/sse/quat_aos.h create mode 100644 src/BulletMultiThreaded/vectormath/sse/vec_aos.h create mode 100644 src/BulletMultiThreaded/vectormath/sse/vecidx_aos.h create mode 100644 src/BulletMultiThreaded/vectormath/sse/vectormath_aos.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cd0b9183..49143982f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ -cmake_minimum_required(VERSION 2.4) - +cmake_minimum_required(VERSION 2.4.3) +set(CMAKE_ALLOW_LOOSE_LOOP_CONSTRUCTS true) #this line has to appear before 'PROJECT' in order to be able to disable incremental linking SET(MSVC_INCREMENTAL_DEFAULT ON) @@ -12,10 +12,8 @@ IF (NOT CMAKE_BUILD_TYPE) SET(CMAKE_BUILD_TYPE "Release") ENDIF (NOT CMAKE_BUILD_TYPE) -SET(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -DDEBUG") +SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_DEBUG") -SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_DEBUG=3") -SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -D_DEBUG=4") MESSAGE("CMAKE_CXX_FLAGS_DEBUG="+${CMAKE_CXX_FLAGS_DEBUG}) OPTION(USE_DOUBLE_PRECISION "Use double precision" OFF) OPTION(USE_GRAPHICAL_BENCHMARK "Use Graphical Benchmark" ON) @@ -28,6 +26,11 @@ ENDIF() OPTION(USE_MSVC_RUNTIME_LIBRARY_DLL "Use MSVC Runtime Library DLL (/MD or /MDd)" ON) OPTION(USE_MSVC_INCREMENTAL_LINKING "Use MSVC Incremental Linking" OFF) +IF(WIN32) +SET (VECTOR_MATH_INCLUDE ${BULLET_PHYSICS_SOURCE_DIR}/src/BulletMultiThreaded/vectormath/sse CACHE PATH "Vector Math library include path.") +ELSE(WIN32) +SET (VECTOR_MATH_INCLUDE ${BULLET_PHYSICS_SOURCE_DIR}/src/BulletMultiThreaded/vectormath/scalar CACHE PATH "Vector Math library include path.") +ENDIF(WIN32) #SET(CMAKE_EXE_LINKER_FLAGS_INIT "/STACK:10000000 /INCREMENTAL:NO") diff --git a/Demos/AllBulletDemos/CMakeLists.txt b/Demos/AllBulletDemos/CMakeLists.txt index a8bcdbdb1..081c9d661 100644 --- a/Demos/AllBulletDemos/CMakeLists.txt +++ b/Demos/AllBulletDemos/CMakeLists.txt @@ -20,10 +20,11 @@ ${BULLET_PHYSICS_SOURCE_DIR}/src ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL ${BULLET_PHYSICS_SOURCE_DIR}/Extras/Serialize/BulletFileLoader ${BULLET_PHYSICS_SOURCE_DIR}/Extras/Serialize/BulletWorldImporter +${VECTOR_MATH_INCLUDE} ) LINK_LIBRARIES( -GLUI GIMPACTUtils ConvexDecomposition OpenGLSupport BulletWorldImporter BulletSoftBody BulletDynamics BulletCollision BulletFileLoader LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} +GLUI GIMPACTUtils ConvexDecomposition BulletMultiThreaded OpenGLSupport BulletWorldImporter BulletSoftBody BulletDynamics BulletCollision BulletFileLoader LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} ) SET(AllBulletDemos_SRCS diff --git a/Demos/BasicDemo/BasicDemo.cpp b/Demos/BasicDemo/BasicDemo.cpp index e945074e1..50566d13c 100644 --- a/Demos/BasicDemo/BasicDemo.cpp +++ b/Demos/BasicDemo/BasicDemo.cpp @@ -33,6 +33,11 @@ subject to the following restrictions: #include "GlutStuff.h" ///btBulletDynamicsCommon.h is the main Bullet include file, contains most common include files. #include "btBulletDynamicsCommon.h" + +#include "BulletMultiThreaded/Win32ThreadSupport.h" +#include "BulletMultiThreaded/btParallelConstraintSolver.h" +#include "BulletCollision/CollisionDispatch/btSimulationIslandManager.h" + #ifdef TEST_SERIALIZATION #include "LinearMath/btSerializer.h" #endif //TEST_SERIALIZATION @@ -50,7 +55,7 @@ void BasicDemo::clientMoveAndDisplay() ///step the simulation if (m_dynamicsWorld) { - m_dynamicsWorld->stepSimulation(ms / 1000000.f); + m_dynamicsWorld->stepSimulation(1./60.,0);//ms / 1000000.f,0); //optional but useful: debug drawing m_dynamicsWorld->debugDrawWorld(); } @@ -79,7 +84,35 @@ void BasicDemo::displayCallback(void) { swapBuffers(); } +btThreadSupportInterface* createSolverThreadSupport(int maxNumThreads) +{ +//#define SEQUENTIAL +#ifdef SEQUENTIAL + SequentialThreadSupport::SequentialThreadConstructionInfo tci("solverThreads",SolverThreadFunc,SolverlsMemoryFunc); + SequentialThreadSupport* threadSupport = new SequentialThreadSupport(tci); + threadSupport->startSPU(); +#else +#ifdef _WIN32 + Win32ThreadSupport::Win32ThreadConstructionInfo threadConstructionInfo("solverThreads",SolverThreadFunc,SolverlsMemoryFunc,maxNumThreads); + Win32ThreadSupport* threadSupport = new Win32ThreadSupport(threadConstructionInfo); + threadSupport->startSPU(); +#elif defined (USE_PTHREADS) + PosixThreadSupport::ThreadConstructionInfo solverConstructionInfo("solver", SolverThreadFunc, + SolverlsMemoryFunc, maxNumThreads); + + PosixThreadSupport* threadSupport = new PosixThreadSupport(solverConstructionInfo); + +#else + SequentialThreadSupport::SequentialThreadConstructionInfo tci("solverThreads",SolverThreadFunc,SolverlsMemoryFunc); + SequentialThreadSupport* threadSupport = new SequentialThreadSupport(tci); + threadSupport->startSPU(); +#endif + +#endif + + return threadSupport; +} @@ -100,10 +133,21 @@ void BasicDemo::initPhysics() m_broadphase = new btDbvtBroadphase(); ///the default constraint solver. For parallel processing you can use a different solver (see Extras/BulletMultiThreaded) - btSequentialImpulseConstraintSolver* sol = new btSequentialImpulseConstraintSolver; + //btSequentialImpulseConstraintSolver* sol = new btSequentialImpulseConstraintSolver; + + //btSequentialImpulseConstraintSolver* sol = new btSequentialImpulseConstraintSolver; + btParallelConstraintSolver* sol = new btParallelConstraintSolver(createSolverThreadSupport(4)); + + + + m_solver = sol; - m_dynamicsWorld = new btDiscreteDynamicsWorld(m_dispatcher,m_broadphase,m_solver,m_collisionConfiguration); + btDiscreteDynamicsWorld* world = new btDiscreteDynamicsWorld(m_dispatcher,m_broadphase,m_solver,m_collisionConfiguration); + + m_dynamicsWorld = world; + world->getSimulationIslandManager()->setSplitIslands(false); + m_dynamicsWorld->setGravity(btVector3(0,-10,0)); diff --git a/Demos/BasicDemo/CMakeLists.txt b/Demos/BasicDemo/CMakeLists.txt index 09568037b..d00423382 100644 --- a/Demos/BasicDemo/CMakeLists.txt +++ b/Demos/BasicDemo/CMakeLists.txt @@ -13,13 +13,14 @@ SET(GLUT_ROOT ${BULLET_PHYSICS_SOURCE_DIR}/Glut) INCLUDE_DIRECTORIES( ${BULLET_PHYSICS_SOURCE_DIR}/src ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL +${VECTOR_MATH_INCLUDE} ) IF (USE_GLUT) LINK_LIBRARIES( - OpenGLSupport BulletDynamics BulletCollision LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} + OpenGLSupport BulletMultiThreaded BulletDynamics BulletCollision LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} ) IF (WIN32) diff --git a/Demos/Benchmarks/BenchmarkDemo.cpp b/Demos/Benchmarks/BenchmarkDemo.cpp index a6e313b90..194e38577 100644 --- a/Demos/Benchmarks/BenchmarkDemo.cpp +++ b/Demos/Benchmarks/BenchmarkDemo.cpp @@ -29,9 +29,12 @@ subject to the following restrictions: #include "Taru.mdl" #include "landscape.mdl" #include "BulletCollision/BroadphaseCollision/btDbvtBroadphase.h" +#ifdef USE_PARALLEL_DISPATCHER_BENCHMARK #include "BulletMultiThreaded/SpuGatheringCollisionDispatcher.h" #include "BulletMultiThreaded/SequentialThreadSupport.h" #include "BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h" +#endif + #include "BulletCollision/CollisionDispatch/btSimulationIslandManager.h" diff --git a/Demos/Benchmarks/CMakeLists.txt b/Demos/Benchmarks/CMakeLists.txt index ab4277875..bcd771fef 100644 --- a/Demos/Benchmarks/CMakeLists.txt +++ b/Demos/Benchmarks/CMakeLists.txt @@ -11,11 +11,12 @@ IF (USE_GRAPHICAL_BENCHMARK) IF (USE_GLUT) INCLUDE_DIRECTORIES( - ${BULLET_PHYSICS_SOURCE_DIR}/src - ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL + ${BULLET_PHYSICS_SOURCE_DIR}/src + ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL ) IF (USE_MULTITHREADED_BENCHMARK) + INCLUDE_DIRECTORIES( ${VECTOR_MATH_INCLUDE} ) LINK_LIBRARIES( OpenGLSupport BulletMultiThreaded BulletDynamics BulletCollision LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}) ELSE() LINK_LIBRARIES( OpenGLSupport BulletDynamics BulletCollision LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}) @@ -65,6 +66,7 @@ ELSE (USE_GLUT) ) IF (USE_MULTITHREADED_BENCHMARK) + INCLUDE_DIRECTORIES( ${VECTOR_MATH_INCLUDE} ) LINK_LIBRARIES( OpenGLSupport BulletMultiThreaded BulletDynamics BulletCollision LinearMath ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} ) diff --git a/Demos/Box2dDemo/CMakeLists.txt b/Demos/Box2dDemo/CMakeLists.txt index ccde7c810..0b434f652 100644 --- a/Demos/Box2dDemo/CMakeLists.txt +++ b/Demos/Box2dDemo/CMakeLists.txt @@ -1,75 +1,75 @@ -# This is basically the overall name of the project in Visual Studio this is the name of the Solution File - - -# For every executable you have with a main method you should have an add_executable line below. -# For every add executable line you should list every .cpp and .h file you have associated with that executable. - - -# This is the variable for Windows. I use this to define the root of my directory structure. -SET(GLUT_ROOT ${BULLET_PHYSICS_SOURCE_DIR}/Glut) - -# You shouldn't have to modify anything below this line -######################################################## - -INCLUDE_DIRECTORIES( -${BULLET_PHYSICS_SOURCE_DIR}/src ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL -) - -IF (USE_GLUT) - LINK_LIBRARIES( - OpenGLSupport BulletDynamics BulletCollision LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} - ) - - IF (WIN32) - ADD_EXECUTABLE(AppBox2dDemo - main.cpp - Box2dDemo.cpp - Box2dDemo.h - ${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc - ) - ELSE() - ADD_EXECUTABLE(AppBox2dDemo - main.cpp - Box2dDemo.cpp - Box2dDemo.h - ) - ENDIF() - IF (WIN32) - IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES) - IF (CMAKE_CL_64) - ADD_CUSTOM_COMMAND( - TARGET AppBox2dDemo - POST_BUILD - COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR} - ) - ELSE(CMAKE_CL_64) - ADD_CUSTOM_COMMAND( - TARGET AppBox2dDemo - POST_BUILD - COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR} - ) - ENDIF(CMAKE_CL_64) - ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES) - - ENDIF(WIN32) -ELSE (USE_GLUT) - LINK_LIBRARIES( - OpenGLSupport BulletDynamics BulletCollision LinearMath ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} - ) - - ADD_EXECUTABLE(AppBox2dDemo - WIN32 - ../OpenGL/Win32AppMain.cpp - Win32Box2dDemo.cpp - Box2dDemo.cpp - Box2dDemo.h - ${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc - ) -ENDIF (USE_GLUT) - +# This is basically the overall name of the project in Visual Studio this is the name of the Solution File + + +# For every executable you have with a main method you should have an add_executable line below. +# For every add executable line you should list every .cpp and .h file you have associated with that executable. + + +# This is the variable for Windows. I use this to define the root of my directory structure. +SET(GLUT_ROOT ${BULLET_PHYSICS_SOURCE_DIR}/Glut) + +# You shouldn't have to modify anything below this line +######################################################## + +INCLUDE_DIRECTORIES( +${BULLET_PHYSICS_SOURCE_DIR}/src ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL +) + +IF (USE_GLUT) + LINK_LIBRARIES( + OpenGLSupport BulletDynamics BulletCollision LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} + ) + + IF (WIN32) + ADD_EXECUTABLE(AppBox2dDemo + main.cpp + Box2dDemo.cpp + Box2dDemo.h + ${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc + ) + ELSE() + ADD_EXECUTABLE(AppBox2dDemo + main.cpp + Box2dDemo.cpp + Box2dDemo.h + ) + ENDIF() + IF (WIN32) + IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES) + IF (CMAKE_CL_64) + ADD_CUSTOM_COMMAND( + TARGET AppBox2dDemo + POST_BUILD + COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR} + ) + ELSE(CMAKE_CL_64) + ADD_CUSTOM_COMMAND( + TARGET AppBox2dDemo + POST_BUILD + COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR} + ) + ENDIF(CMAKE_CL_64) + ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES) + + ENDIF(WIN32) +ELSE (USE_GLUT) + LINK_LIBRARIES( + OpenGLSupport BulletDynamics BulletCollision LinearMath ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} + ) + + ADD_EXECUTABLE(AppBox2dDemo + WIN32 + ../OpenGL/Win32AppMain.cpp + Win32Box2dDemo.cpp + Box2dDemo.cpp + Box2dDemo.h + ${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc + ) +ENDIF (USE_GLUT) + IF (INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES) SET_TARGET_PROPERTIES(AppBox2dDemo PROPERTIES DEBUG_POSTFIX "_Debug") SET_TARGET_PROPERTIES(AppBox2dDemo PROPERTIES MINSIZEREL_POSTFIX "_MinsizeRel") SET_TARGET_PROPERTIES(AppBox2dDemo PROPERTIES RELWITHDEBINFO_POSTFIX "_RelWithDebugInfo") -ENDIF(INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES) - +ENDIF(INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES) + diff --git a/Demos/CMakeLists.txt b/Demos/CMakeLists.txt index 450d1f783..979bc8180 100644 --- a/Demos/CMakeLists.txt +++ b/Demos/CMakeLists.txt @@ -18,17 +18,12 @@ ELSE() ) ENDIF() -if (CMAKE_SIZEOF_VOID_P MATCHES "8") - SUBDIRS( ${SharedDemoSubdirs} - ) -else (CMAKE_SIZEOF_VOID_P MATCHES "8") SUBDIRS( ${SharedDemoSubdirs} ThreadingDemo MultiThreadedDemo VectorAdd_OpenCL ParticlesOpenCL ) -endif (CMAKE_SIZEOF_VOID_P MATCHES "8") ELSE (USE_GLUT) diff --git a/Demos/ConcaveRaycastDemo/ConcaveRaycastDemo.cpp b/Demos/ConcaveRaycastDemo/ConcaveRaycastDemo.cpp index d1abd1c0c..c272517ba 100644 --- a/Demos/ConcaveRaycastDemo/ConcaveRaycastDemo.cpp +++ b/Demos/ConcaveRaycastDemo/ConcaveRaycastDemo.cpp @@ -25,7 +25,7 @@ subject to the following restrictions: #ifdef BATCH_RAYCASTER #include "BulletMultiThreaded/SpuBatchRaycaster.h" static SpuBatchRaycaster* gBatchRaycaster = NULL; -#endif + #ifdef USE_LIBSPE2 #include "BulletMultiThreaded/SpuLibspe2Support.h" @@ -35,6 +35,7 @@ static SpuBatchRaycaster* gBatchRaycaster = NULL; //other platforms run the parallel code sequentially (until pthread support or other parallel implementation is added) #include "BulletMultiThreaded/SequentialThreadSupport.h" #endif //USE_LIBSPE2 +#endif //BATCH_RAYCASTER static btVector3* gVertices=0; static int* gIndices=0; diff --git a/Demos/MultiThreadedDemo/CMakeLists.txt b/Demos/MultiThreadedDemo/CMakeLists.txt index 0dbda097c..063e96ea1 100644 --- a/Demos/MultiThreadedDemo/CMakeLists.txt +++ b/Demos/MultiThreadedDemo/CMakeLists.txt @@ -1,33 +1,53 @@ -# This is basically the overall name of the project in Visual Studio this is the name of the Solution File - - -# For every executable you have with a main method you should have an add_executable line below. -# For every add executable line you should list every .cpp and .h file you have associated with that executable. - - - -# You shouldn't have to modify anything below this line -######################################################## - - -INCLUDE_DIRECTORIES( -${BULLET_PHYSICS_SOURCE_DIR}/src ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL -) - -LINK_LIBRARIES( -OpenGLSupport BulletMultiThreaded BulletDynamics BulletCollision LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} -) - -ADD_EXECUTABLE(AppMultiThreadedDemo - main.cpp - MultiThreadedDemo.cpp - MultiThreadedDemo.h -) -IF (UNIX) - TARGET_LINK_LIBRARIES(AppMultiThreadedDemo pthread) -ENDIF(UNIX) - - +# This is basically the overall name of the project in Visual Studio this is the name of the Solution File + + +# For every executable you have with a main method you should have an add_executable line below. +# For every add executable line you should list every .cpp and .h file you have associated with that executable. + + + +# You shouldn't have to modify anything below this line +######################################################## + + +INCLUDE_DIRECTORIES( +${BULLET_PHYSICS_SOURCE_DIR}/src +${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL +${VECTOR_MATH_INCLUDE} +) + +LINK_LIBRARIES( +OpenGLSupport BulletMultiThreaded BulletDynamics BulletCollision LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY} +) + +ADD_EXECUTABLE(AppMultiThreadedDemo + main.cpp + MultiThreadedDemo.cpp + MultiThreadedDemo.h +) +IF (UNIX) + TARGET_LINK_LIBRARIES(AppMultiThreadedDemo pthread) +ENDIF(UNIX) + +IF(WIN32) +IF (CMAKE_CL_64) + IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES) + ADD_CUSTOM_COMMAND( TARGET AppMultiThreadedDemo POST_BUILD + COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR} ) + ADD_CUSTOM_COMMAND( TARGET AppMultiThreadedDemo POST_BUILD + COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW64.DLL ${CMAKE_CURRENT_BINARY_DIR}) + ENDIF() +ELSE(CMAKE_CL_64) + IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES) + ADD_CUSTOM_COMMAND( TARGET AppMultiThreadedDemo POST_BUILD + COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR}) + ADD_CUSTOM_COMMAND( TARGET AppMultiThreadedDemo POST_BUILD + COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW32.DLL ${CMAKE_CURRENT_BINARY_DIR}) + ENDIF() +ENDIF(CMAKE_CL_64) +ENDIF(WIN32) + + IF (INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES) SET_TARGET_PROPERTIES(AppMultiThreadedDemo PROPERTIES DEBUG_POSTFIX "_Debug") SET_TARGET_PROPERTIES(AppMultiThreadedDemo PROPERTIES MINSIZEREL_POSTFIX "_MinsizeRel") diff --git a/Demos/MultiThreadedDemo/MultiThreadedDemo.cpp b/Demos/MultiThreadedDemo/MultiThreadedDemo.cpp index 4fff696df..5f5c437bc 100644 --- a/Demos/MultiThreadedDemo/MultiThreadedDemo.cpp +++ b/Demos/MultiThreadedDemo/MultiThreadedDemo.cpp @@ -13,12 +13,13 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ -//#define USE_PARALLEL_SOLVER 1 //experimental parallel solver +#define USE_PARALLEL_SOLVER 1 //experimental parallel solver #define USE_PARALLEL_DISPATCHER 1 #include "btBulletDynamicsCommon.h" #include "BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h" #include "BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h" +#include "BulletCollision/CollisionDispatch/btSimulationIslandManager.h" #ifdef USE_PARALLEL_DISPATCHER #include "BulletMultiThreaded/SpuGatheringCollisionDispatcher.h" @@ -37,18 +38,52 @@ subject to the following restrictions: #else //other platforms run the parallel code sequentially (until pthread support or other parallel implementation is added) -#include "BulletMultiThreaded/SequentialThreadSupport.h" + #include "BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h" #endif //USE_LIBSPE2 #ifdef USE_PARALLEL_SOLVER -#include "BulletMultiThreaded/SpuParallelSolver.h" -#include "BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.h" +#include "BulletMultiThreaded/btParallelConstraintSolver.h" +#include "BulletMultiThreaded/SequentialThreadSupport.h" + + +btThreadSupportInterface* createSolverThreadSupport(int maxNumThreads) +{ +//#define SEQUENTIAL +#ifdef SEQUENTIAL + SequentialThreadSupport::SequentialThreadConstructionInfo tci("solverThreads",SolverThreadFunc,SolverlsMemoryFunc); + SequentialThreadSupport* threadSupport = new SequentialThreadSupport(tci); + threadSupport->startSPU(); +#else + +#ifdef _WIN32 + Win32ThreadSupport::Win32ThreadConstructionInfo threadConstructionInfo("solverThreads",SolverThreadFunc,SolverlsMemoryFunc,maxNumThreads); + Win32ThreadSupport* threadSupport = new Win32ThreadSupport(threadConstructionInfo); + threadSupport->startSPU(); +#elif defined (USE_PTHREADS) + PosixThreadSupport::ThreadConstructionInfo solverConstructionInfo("solver", SolverThreadFunc, + SolverlsMemoryFunc, maxNumThreads); + + PosixThreadSupport* threadSupport = new PosixThreadSupport(solverConstructionInfo); + +#else + SequentialThreadSupport::SequentialThreadConstructionInfo tci("solverThreads",SolverThreadFunc,SolverlsMemoryFunc); + SequentialThreadSupport* threadSupport = new SequentialThreadSupport(tci); + threadSupport->startSPU(); +#endif + +#endif + + return threadSupport; +} + #endif //USE_PARALLEL_SOLVER #endif//USE_PARALLEL_DISPATCHER + + #include "LinearMath/btQuickprof.h" #include "LinearMath/btIDebugDraw.h" @@ -162,8 +197,6 @@ void MultiThreadedDemo::clientMoveAndDisplay() int numSimSteps = 0; numSimSteps = m_dynamicsWorld->stepSimulation(dt,maxSimSubSteps); - //optional but useful: debug drawing - m_dynamicsWorld->debugDrawWorld(); #ifdef VERBOSE_TIMESTEPPING_CONSOLEOUTPUT if (!numSimSteps) @@ -182,6 +215,9 @@ void MultiThreadedDemo::clientMoveAndDisplay() #endif //VERBOSE_TIMESTEPPING_CONSOLEOUTPUT #endif + //optional but useful: debug drawing + m_dynamicsWorld->debugDrawWorld(); + } #ifdef USE_QUICKPROF @@ -321,41 +357,23 @@ m_threadSupportCollision = new Win32ThreadSupport(Win32ThreadSupport::Win32Threa #ifdef USE_PARALLEL_SOLVER - -#ifdef USE_WIN32_THREADING - m_threadSupportSolver = new Win32ThreadSupport(Win32ThreadSupport::Win32ThreadConstructionInfo( - "solver", - processSolverTask, - createSolverLocalStoreMemory, - maxNumOutstandingTasks)); -#elif defined (USE_PTHREADS) - PosixThreadSupport::ThreadConstructionInfo solverConstructionInfo("solver", processSolverTask, - createSolverLocalStoreMemory, maxNumOutstandingTasks); - - m_threadSupportSolver = new PosixThreadSupport(solverConstructionInfo); -#else - //for now use sequential version - SequentialThreadSupport::SequentialThreadConstructionInfo solverCI("solver",processSolverTask,createSolverLocalStoreMemory); - m_threadSupportSolver = new SequentialThreadSupport(solverCI); - -#endif //USE_WIN32_THREADING - m_solver = new btParallelSequentialImpulseSolver(m_threadSupportSolver,maxNumOutstandingTasks); - + m_threadSupportSolver = createSolverThreadSupport(maxNumOutstandingTasks); + m_solver = new btParallelConstraintSolver(m_threadSupportSolver); #else btSequentialImpulseConstraintSolver* solver = new btSequentialImpulseConstraintSolver(); - m_solver = solver; //default solverMode is SOLVER_RANDMIZE_ORDER. Warmstarting seems not to improve convergence, see //solver->setSolverMode(0);//btSequentialImpulseConstraintSolver::SOLVER_USE_WARMSTARTING | btSequentialImpulseConstraintSolver::SOLVER_RANDMIZE_ORDER); - #endif //USE_PARALLEL_SOLVER + btDiscreteDynamicsWorld* world = new btDiscreteDynamicsWorld(m_dispatcher,m_broadphase,m_solver,m_collisionConfiguration); m_dynamicsWorld = world; + world->getSimulationIslandManager()->setSplitIslands(false); world->getSolverInfo().m_numIterations = 4; - world->getSolverInfo().m_solverMode = SOLVER_SIMD+SOLVER_USE_WARMSTARTING; + world->getSolverInfo().m_solverMode = SOLVER_SIMD+SOLVER_USE_WARMSTARTING;//+SOLVER_RANDMIZE_ORDER; m_dynamicsWorld->getDispatchInfo().m_enableSPU = true; m_dynamicsWorld->setGravity(btVector3(0,-10,0)); diff --git a/Demos/OpenGL/DemoApplication.cpp b/Demos/OpenGL/DemoApplication.cpp index 121381ca7..b09680d0a 100644 --- a/Demos/OpenGL/DemoApplication.cpp +++ b/Demos/OpenGL/DemoApplication.cpp @@ -954,8 +954,8 @@ void DemoApplication::mouseMotionFunc(int x,int y) { btVector3 hor = getRayTo(0,0)-getRayTo(1,0); btVector3 vert = getRayTo(0,0)-getRayTo(0,1); - btScalar multiplierX = btScalar(0.01); - btScalar multiplierY = btScalar(0.01); + btScalar multiplierX = btScalar(0.001); + btScalar multiplierY = btScalar(0.001); if (m_ortho) { multiplierX = 1; @@ -979,7 +979,7 @@ void DemoApplication::mouseMotionFunc(int x,int y) } else if(m_mouseButtons & 4) { - m_cameraDistance -= dy * btScalar(0.2f); + m_cameraDistance -= dy * btScalar(0.02f); if (m_cameraDistancesetActivationState(WANTS_DEACTIVATION); } //removed cached contact points (this is not necessary if all objects have been removed from the dynamics world) - //m_dynamicsWorld->getBroadphase()->getOverlappingPairCache()->cleanProxyFromPairs(colObj->getBroadphaseHandle(),getDynamicsWorld()->getDispatcher()); + if (m_dynamicsWorld->getBroadphase()->getOverlappingPairCache()) + m_dynamicsWorld->getBroadphase()->getOverlappingPairCache()->cleanProxyFromPairs(colObj->getBroadphaseHandle(),getDynamicsWorld()->getDispatcher()); btRigidBody* body = btRigidBody::upcast(colObj); if (body && !body->isStaticObject()) diff --git a/Demos/ParticlesOpenCL/NVidia/CMakeLists.txt b/Demos/ParticlesOpenCL/NVidia/CMakeLists.txt index 87f473e1d..3b17c49ed 100644 --- a/Demos/ParticlesOpenCL/NVidia/CMakeLists.txt +++ b/Demos/ParticlesOpenCL/NVidia/CMakeLists.txt @@ -10,14 +10,14 @@ ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL IF(INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES) INCLUDE_DIRECTORIES( $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/inc ) IF (CMAKE_CL_64) - SET(CMAK_NVSDKCOMPUTE_LIBPATH ) + SET(CMAK_NVSDKCOMPUTE_LIBPATH $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/x64 ) ELSE(CMAKE_CL_64) - SET(CMAK_NVSDKCOMPUTE_LIBPATH $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/x64 ) + SET(CMAK_NVSDKCOMPUTE_LIBPATH $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/Win32 ) ENDIF(CMAKE_CL_64) ELSE() INCLUDE_DIRECTORIES( $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/inc ) IF (CMAKE_CL_64) - SET(CMAK_NVSDKCOMPUTE_LIBPATH ) + SET(CMAK_NVSDKCOMPUTE_LIBPATH $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/x64 ) ELSE(CMAKE_CL_64) SET(CMAK_NVSDKCOMPUTE_LIBPATH $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/Win32 ) ENDIF(CMAKE_CL_64) @@ -35,7 +35,6 @@ IF (USE_GLUT) OpenGLSupport BulletDynamics BulletCollision - BulletMultiThreaded LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} @@ -74,8 +73,6 @@ IF (CMAKE_CL_64) COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR} ) ADD_CUSTOM_COMMAND( TARGET AppParticlesOCL_Nv POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW64.DLL ${CMAKE_CURRENT_BINARY_DIR}) - ADD_CUSTOM_COMMAND( TARGET AppParticlesOCL_Nv POST_BUILD - COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesOCL.cl ${CMAKE_CURRENT_BINARY_DIR}) ENDIF() ELSE(CMAKE_CL_64) IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES) @@ -83,10 +80,13 @@ ELSE(CMAKE_CL_64) COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR}) ADD_CUSTOM_COMMAND( TARGET AppParticlesOCL_Nv POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW32.DLL ${CMAKE_CURRENT_BINARY_DIR}) + ENDIF() +ENDIF(CMAKE_CL_64) + + IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES) ADD_CUSTOM_COMMAND( TARGET AppParticlesOCL_Nv POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesOCL.cl ${CMAKE_CURRENT_BINARY_DIR}) ENDIF() -ENDIF(CMAKE_CL_64) ENDIF(WIN32) IF (UNIX) diff --git a/Demos/ParticlesOpenCL/ParticlesDemo.cpp b/Demos/ParticlesOpenCL/ParticlesDemo.cpp index 6a789a2d3..331c73c22 100644 --- a/Demos/ParticlesOpenCL/ParticlesDemo.cpp +++ b/Demos/ParticlesOpenCL/ParticlesDemo.cpp @@ -22,8 +22,9 @@ subject to the following restrictions: //#define START_POS_Z btScalar(0.4f) #define ARRAY_SIZE_X 32 #define ARRAY_SIZE_Y 32 -//#define ARRAY_SIZE_Y 5 +//#define ARRAY_SIZE_Y 16 #define ARRAY_SIZE_Z 16 +//16 //#define ARRAY_SIZE_Z 1 //#define DIST btScalar(2.f) #define DIST (DEF_PARTICLE_RADIUS * 2.f) @@ -65,7 +66,7 @@ subject to the following restrictions: -btScalar gTimeStep = btScalar(1./60.); +btScalar gTimeStep = 0.5f;//btScalar(1./60.); #define SCALING btScalar(1.f) @@ -254,6 +255,7 @@ void ParticlesDemo::initPhysics() { // btCollisionShape* colShape = new btSphereShape(btScalar(1.0f)); +/* btCollisionShape* colShape = new btSphereShape(DEF_PARTICLE_RADIUS); m_collisionShapes.push_back(colShape); btTransform startTransform; @@ -269,37 +271,53 @@ void ParticlesDemo::initPhysics() rbInfo.m_startWorldTransform = startTransform; btRigidBody* body = new btRigidBody(rbInfo); m_pWorld->addRigidBody(body); + */ + init_scene_directly(); } clientResetScene(); m_pWorld->initDeviceData(); } -static float frand(void) { return 2.0f * (float)rand()/(float)RAND_MAX - 1.0f; } +inline float frand(void){ + return (float)rand() / (float)RAND_MAX; +} void ParticlesDemo::init_scene_directly() { - float start_x = START_POS_X - ARRAY_SIZE_X * DIST * btScalar(0.5f); - float start_y = START_POS_Y - ARRAY_SIZE_Y * DIST * btScalar(0.5f); - float start_z = START_POS_Z - ARRAY_SIZE_Z * DIST * btScalar(0.5f); - int total = ARRAY_SIZE_X * ARRAY_SIZE_Y * ARRAY_SIZE_Z; - m_pWorld->m_hPos.resize(total); - m_pWorld->m_hVel.resize(total); - total = 0; - for (int k=0;km_hVel[total] = btVector3(0., 0., 0.); - btVector3 jitter = 0.01f * 0.03f * btVector3(frand(), frand(), frand()); - m_pWorld->m_hPos[total] = btVector3(DIST*i + start_x, DIST*k + start_y, DIST*j + start_z) + jitter; - total++; - } - } - } - m_pWorld->m_numParticles = total; + + + srand(1969); + float start_x = -1+DEF_PARTICLE_RADIUS;//START_POS_X - ARRAY_SIZE_X * DIST * btScalar(0.5f); + float start_y = -1+DEF_PARTICLE_RADIUS;//START_POS_Y - ARRAY_SIZE_Y * DIST * btScalar(0.5f); + float start_z = -1+DEF_PARTICLE_RADIUS;//START_POS_Z - ARRAY_SIZE_Z * DIST * btScalar(0.5f); + int numParticles = ARRAY_SIZE_X * ARRAY_SIZE_Y * ARRAY_SIZE_Z; + m_pWorld->m_hPos.resize(numParticles); + m_pWorld->m_hVel.resize(numParticles); + + btScalar spacing = 2 * DEF_PARTICLE_RADIUS; + + for(int z=0; zm_hVel[i]= btVector3(0,0,0); + m_pWorld->m_hPos[i].setX((spacing * x) + DEF_PARTICLE_RADIUS -WORLD_SIZE+jitter.getX()); + m_pWorld->m_hPos[i].setY((spacing * y) + DEF_PARTICLE_RADIUS -WORLD_SIZE+jitter.getY()); + m_pWorld->m_hPos[i].setZ((spacing * z) + DEF_PARTICLE_RADIUS -WORLD_SIZE+jitter.getZ()); + } + } + } + } + + m_pWorld->m_numParticles = numParticles; + } @@ -314,6 +332,7 @@ void ParticlesDemo::clientResetScene() } else { + m_pWorld->grabSimulationData(); } } @@ -392,7 +411,7 @@ void ParticlesDemo::renderme() { glColor3f(1.0, 1.0, 1.0); - glutWireCube(2.0); + glutWireCube(2*WORLD_SIZE); glPointSize(5.0f); glEnable(GL_POINT_SPRITE_ARB); diff --git a/Demos/ParticlesOpenCL/ParticlesOCL.cl b/Demos/ParticlesOpenCL/ParticlesOCL.cl index 16407cedc..2ddf9cd9f 100644 --- a/Demos/ParticlesOpenCL/ParticlesOCL.cl +++ b/Demos/ParticlesOpenCL/ParticlesOCL.cl @@ -81,7 +81,7 @@ __kernel void kFindCellStart( int numParticles, __global float4* pSortedVel GUID_ARG) { int index = get_global_id(0); - __local int sharedHash[513]; + __local int sharedHash[1025];//maximum workgroup size 1024 int2 sortedData; if(index < numParticles) @@ -142,6 +142,8 @@ __kernel void kIntegrateMotion( int numParticles, // collide with world boundaries float4 worldMin = *((__global float4*)(pParams + 1)); float4 worldMax = *((__global float4*)(pParams + 2)); + + if(pos.x < (worldMin.x + particleRad)) { pos.x = worldMin.x + particleRad; @@ -216,7 +218,6 @@ float4 collideTwoParticles( } - __kernel void kCollideParticles(int numParticles, __global float4* pVel, //output: new velocity __global const float4* pSortedPos, //input: reordered positions @@ -264,8 +265,10 @@ __kernel void kCollideParticles(int numParticles, continue; } //Iterate over particles in this cell - int endI = startI + 8; - if(endI >= numParticles) endI = numParticles - 1; + int endI = startI + 32; + if(endI >= numParticles) + endI = numParticles ; + for(int j = startI; j < endI; j++) { uint hashC = pPosHash[j].x; diff --git a/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp b/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp index 766e54a9a..2f8ca5e6a 100644 --- a/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp +++ b/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp @@ -74,6 +74,7 @@ int btParticlesDynamicsWorld::stepSimulation( btScalar timeStep, int maxSubSteps } { BT_PROFILE("CollideParticles"); +// printf("\ncollide particles\n\n"); runCollideParticlesKernel(); } gStepNum++; @@ -212,8 +213,8 @@ void btParticlesDynamicsWorld::adjustGrid() m_worldMin -= wsize; m_worldMax += wsize; */ - m_worldMin.setValue(-1.f, -1.f, -1.f); - m_worldMax.setValue( 1.f, 1.f, 1.f); + m_worldMin.setValue(-WORLD_SIZE, -WORLD_SIZE, -WORLD_SIZE); + m_worldMax.setValue( WORLD_SIZE, WORLD_SIZE, WORLD_SIZE); wsize = m_worldMax - m_worldMin; m_cellSize[0] = m_cellSize[1] = m_cellSize[2] = m_particleRad * btScalar(2.f); @@ -246,7 +247,12 @@ void btParticlesDynamicsWorld::adjustGrid() void btParticlesDynamicsWorld::grabSimulationData() { // const btVector3& gravity = getGravity(); - btVector3 gravity(0., -0.06, 0.); + //btVector3 gravity(0., -0.06, 0.); + //btVector3 gravity(0., -0.0003f, 0.); + btVector3 gravity(0,-0.0003,0); + + + m_simParams.m_gravity[0] = gravity[0]; m_simParams.m_gravity[1] = gravity[1]; m_simParams.m_gravity[2] = gravity[2]; @@ -258,7 +264,7 @@ void btParticlesDynamicsWorld::grabSimulationData() // m_simParams.m_spring = 0.5f; // m_simParams.m_shear = 0.1f; // m_simParams.m_attraction = 0.0f; - m_simParams.m_collisionDamping = 0.02f; + m_simParams.m_collisionDamping = 0.025f;//0.02f; m_simParams.m_spring = 0.5f; m_simParams.m_shear = 0.1f; m_simParams.m_attraction = 0.0f; @@ -427,6 +433,7 @@ void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv) ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 1, sizeof(cl_mem), (void*) &m_dCellStart); initKernel(PARTICLES_KERNEL_FIND_CELL_START, "kFindCellStart"); +// ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 0, sizeof(int), (void*) &m_numParticles); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPosHash); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 2, sizeof(cl_mem), (void*) &m_dCellStart); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 3, sizeof(cl_mem), (void*) &m_dPos); @@ -560,15 +567,28 @@ static btVector3 cpu_collideTwoParticles( float relVelDotNorm = relVel.dot(norm); btVector3 tanVel = relVel - relVelDotNorm * norm; //Spring force (potential) - float springFactor = -spring * (collideDist - dist); - force = springFactor * norm + damping * relVel + shear * tanVel + attraction * relPos; + //float springFactor = -spring * (collideDist - dist); + float springFactor = -0.4 * (collideDist - dist); + force = springFactor * norm + damping * relVel;// + shear * tanVel + attraction * relPos; } return force; } +struct btPair +{ + union + { + int value; + short v0[2]; + }; +}; void btParticlesDynamicsWorld::runCollideParticlesKernel() { + btAlignedObjectArray pairs; + + float particleRad = m_simParams.m_particleRad; + float collideDist2 = (particleRad + particleRad)*(particleRad + particleRad); cl_int ciErrNum; if(m_useCpuControls[SIMSTAGE_COLLIDE_PARTICLES]->m_active) { // CPU version @@ -585,6 +605,7 @@ void btParticlesDynamicsWorld::runCollideParticlesKernel() ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dCellStart, CL_TRUE, 0, memSize, &(m_hCellStart[0]), 0, NULL, NULL); oclCHECKERROR(ciErrNum, CL_SUCCESS); } + for(int index = 0; index < m_numParticles; index++) { btVector3 posA = m_hSortedPos[index]; @@ -618,7 +639,10 @@ void btParticlesDynamicsWorld::runCollideParticlesKernel() continue; } //Iterate over particles in this cell - int endI = startI + 8; + int endI = startI + 32; + if(endI > m_numParticles) + endI = m_numParticles; + for(int j = startI; j < endI; j++) { unsigned int hashC = m_hPosHash[j].x; @@ -630,6 +654,14 @@ void btParticlesDynamicsWorld::runCollideParticlesKernel() { continue; } + + btPair pair; + pair.v0[0] = index; + pair.v0[1] = j; + pairs.push_back(pair.value); + +// printf("index=%d, j=%d\n",index,j); +// printf("(index=%d, j=%d) ",index,j); btVector3 posB = m_hSortedPos[j]; btVector3 velB = m_hSortedVel[j]; //Collide two spheres @@ -642,6 +674,56 @@ void btParticlesDynamicsWorld::runCollideParticlesKernel() //Write new velocity back to original unsorted location m_hVel[unsortedIndex] = velA + force; } + +//#define BRUTE_FORCE_CHECK 1 +#ifdef BRUTE_FORCE_CHECK + for(int index = 0; index < m_numParticles; index++) + { + btVector3 posA = m_hSortedPos[index]; + btVector3 velA = m_hSortedVel[index]; + btVector3 force = btVector3(0, 0, 0); + int unsortedIndex = m_hPosHash[index].y; + + float collisionDamping = m_simParams.m_collisionDamping; + float spring = m_simParams.m_spring; + float shear = m_simParams.m_shear; + float attraction = m_simParams.m_attraction; + for(int j = 0 ; j < m_numParticles; j++) + { + if (index!=j) + { + btVector3 posB = m_hSortedPos[j]; + btVector3 velB = m_hSortedVel[j]; + + + btVector3 relPos = posB - posA; relPos[3] = 0.f; + float dist2 = (relPos[0] * relPos[0] + relPos[1] * relPos[1] + relPos[2] * relPos[2]); + + + + if(dist2 < collideDist2) + { + //Collide two spheres + // force += cpu_collideTwoParticles( posA, posB, velA, velB, particleRad, particleRad, + // spring, collisionDamping, shear, attraction); + + btPair pair; + pair.v0[0] = index; + pair.v0[1] = j; + if (pairs.findLinearSearch(pair.value)==pairs.size()) + { + printf("not found index=%d, j=%d\n",index,j); + } + + + } + } + } + //Write new velocity back to original unsorted location + //m_hVel[unsortedIndex] = velA + force; + } +#endif //BRUTE_FORCE_CHECK + memSize = sizeof(btVector3) * m_numParticles; ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL); oclCHECKERROR(ciErrNum, CL_SUCCESS); @@ -675,7 +757,11 @@ void btParticlesDynamicsWorld::runIntegrateMotionKernel() pos[3] = 1.0f; vel[3] = 0.0f; // apply gravity - btVector3 gravity = *((btVector3*)(m_simParams.m_gravity)); + btVector3 gravity; + gravity[0] = m_simParams.m_gravity[0]; + gravity[1] = m_simParams.m_gravity[1]; + gravity[2] = m_simParams.m_gravity[2]; + float particleRad = m_simParams.m_particleRad; float globalDamping = m_simParams.m_globalDamping; float boundaryDamping = m_simParams.m_boundaryDamping; @@ -684,8 +770,16 @@ void btParticlesDynamicsWorld::runIntegrateMotionKernel() // integrate position pos += vel * m_timeStep; // collide with world boundaries - btVector3 worldMin = *((btVector3*)(m_simParams.m_worldMin)); - btVector3 worldMax = *((btVector3*)(m_simParams.m_worldMax)); + btVector3 worldMin; + worldMin[0] = m_simParams.m_worldMin[0]; + worldMin[1] = m_simParams.m_worldMin[1]; + worldMin[2] = m_simParams.m_worldMin[2]; + + btVector3 worldMax; + worldMax[0] = m_simParams.m_worldMax[0]; + worldMax[1] = m_simParams.m_worldMax[1]; + worldMax[2] = m_simParams.m_worldMax[2]; + for(int j = 0; j < 3; j++) { if(pos[j] < (worldMin[j] + particleRad)) @@ -800,7 +894,7 @@ void btParticlesDynamicsWorld::runSortHashKernel() } }; btHashPosKey* pHash = (btHashPosKey*)(&m_hPosHash[0]); - pHash->quickSort(pHash, 0, m_numParticles - 1); + pHash->quickSort(pHash, 0, m_numParticles ); // pHash->bitonicSort(pHash, 0, m_hashSize, true); // write back to GPU ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL); @@ -899,8 +993,19 @@ void btParticlesDynamicsWorld::initKernel(int kernelId, char* pName) size_t wgSize; ciErrNum = clGetKernelWorkGroupInfo(kernel, m_cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL); oclCHECKERROR(ciErrNum, CL_SUCCESS); -// if (wgSize > 64) -// wgSize = 64; + + + + +// if (wgSize > 256) +// wgSize = 256; + + if (wgSize > 512) + wgSize = 512; + +// if (wgSize > 1024) +// wgSize = 1024; + m_kernels[kernelId].m_Id = kernelId; m_kernels[kernelId].m_kernel = kernel; m_kernels[kernelId].m_name = pName; diff --git a/Demos/ParticlesOpenCL/btParticlesDynamicsWorld.h b/Demos/ParticlesOpenCL/btParticlesDynamicsWorld.h index 8a3be82f7..0076b651e 100644 --- a/Demos/ParticlesOpenCL/btParticlesDynamicsWorld.h +++ b/Demos/ParticlesOpenCL/btParticlesDynamicsWorld.h @@ -45,6 +45,7 @@ subject to the following restrictions: #define PARTICLES_MAX_PARTICLES (65536) #define PARTICLES_MAX_NEIGHBORS (32) #define DEF_PARTICLE_RADIUS (0.023f) +#define WORLD_SIZE 1.9f enum { diff --git a/Demos/ParticlesOpenCL/shaders.cpp b/Demos/ParticlesOpenCL/shaders.cpp index eea1f8ade..59f028c97 100644 --- a/Demos/ParticlesOpenCL/shaders.cpp +++ b/Demos/ParticlesOpenCL/shaders.cpp @@ -1,5 +1,6 @@ #define STRINGIFY(A) #A +#ifdef USE_AMD_OPENCL // vertex shader const char *vertexShader = STRINGIFY( uniform float pointRadius; // point size in world space @@ -17,16 +18,40 @@ void main() //hack around latest AMD graphics cards having troubles with point sprite rendering //the problem is still unresolved on the 5870 card, and results in a black screen //see also http://forums.amd.com/devforum/messageview.cfm?catid=392&threadid=129431 -#ifdef USE_AMD_OPENCL gl_TexCoord = gl_MultiTexCoord0; -#else - gl_TexCoord[0] = gl_MultiTexCoord0; -#endif - gl_Position = gl_ModelViewProjectionMatrix * vec4(gl_Vertex.xyz, 1.0); + + gl_Position = gl_ModelViewProjectionMatrix * vec4(gl_Vertex.xyz, 1.0); gl_FrontColor = gl_Color; } ); +#else +// vertex shader +const char *vertexShader = STRINGIFY( +uniform float pointRadius; // point size in world space +uniform float pointScale; // scale to calculate size in pixels +uniform float densityScale; +uniform float densityOffset; +varying vec3 posEye; +void main() +{ + // calculate window-space point size + posEye = vec3(gl_ModelViewMatrix * vec4(gl_Vertex.xyz, 1.0)); + float dist = length(posEye); + gl_PointSize = pointRadius * (pointScale / dist); +// gl_PointSize = 4.0; +//hack around latest AMD graphics cards having troubles with point sprite rendering +//the problem is still unresolved on the 5870 card, and results in a black screen +//see also http://forums.amd.com/devforum/messageview.cfm?catid=392&threadid=129431 + gl_TexCoord[0] = gl_MultiTexCoord0; + + gl_Position = gl_ModelViewProjectionMatrix * vec4(gl_Vertex.xyz, 1.0); + + gl_FrontColor = gl_Color; +} +); + +#endif // pixel shader for rendering points as shaded spheres const char *spherePixelShader = STRINGIFY( diff --git a/Demos/SharedOpenCL/btOclUtils.cpp b/Demos/SharedOpenCL/btOclUtils.cpp index bca361449..7af73b92a 100644 --- a/Demos/SharedOpenCL/btOclUtils.cpp +++ b/Demos/SharedOpenCL/btOclUtils.cpp @@ -9,6 +9,7 @@ + ////////////////////////////////////////////////////////////////////////////// //! Gets the id of the nth device from the context //! diff --git a/Demos/ThreadingDemo/main.cpp b/Demos/ThreadingDemo/main.cpp index e5168a76c..a828dc6cd 100644 --- a/Demos/ThreadingDemo/main.cpp +++ b/Demos/ThreadingDemo/main.cpp @@ -23,6 +23,8 @@ subject to the following restrictions: void SampleThreadFunc(void* userPtr,void* lsMemory); void* SamplelsMemoryFunc(); +#include + #ifdef __APPLE__ #include "BulletMultiThreaded/PosixThreadSupport.h" @@ -148,7 +150,7 @@ int main(int argc,char** argv) int numActiveThreads = numThreads; while (numActiveThreads) { - if (threadSupport->isTaskCompleted(&arg0,&arg1,0)) + if (((Win32ThreadSupport*)threadSupport)->isTaskCompleted(&arg0,&arg1,0)) { numActiveThreads--; printf("numActiveThreads = %d\n",numActiveThreads); diff --git a/Demos/VectorAdd_OpenCL/MiniCL_VectorAdd.cpp b/Demos/VectorAdd_OpenCL/MiniCL_VectorAdd.cpp index 92db86225..e838c6768 100644 --- a/Demos/VectorAdd_OpenCL/MiniCL_VectorAdd.cpp +++ b/Demos/VectorAdd_OpenCL/MiniCL_VectorAdd.cpp @@ -192,7 +192,7 @@ int main(int argc, char **argv) }; // Create OpenCL context & context - cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_CPU, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU + cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU // Query all devices available to the context ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes); diff --git a/Demos/VectorAdd_OpenCL/NVidia/CMakeLists.txt b/Demos/VectorAdd_OpenCL/NVidia/CMakeLists.txt index 30db18d2e..05adb0575 100644 --- a/Demos/VectorAdd_OpenCL/NVidia/CMakeLists.txt +++ b/Demos/VectorAdd_OpenCL/NVidia/CMakeLists.txt @@ -4,16 +4,16 @@ IF(INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES) INCLUDE_DIRECTORIES( $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/inc ) IF (CMAKE_CL_64) - SET(CMAKE_NVSDKCOMPUTE_LIBPATH ) + SET(CMAK_NVSDKCOMPUTE_LIBPATH $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/x64 ) ELSE(CMAKE_CL_64) - SET(CMAKE_NVSDKCOMPUTE_LIBPATH $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/x64 ) + SET(CMAK_NVSDKCOMPUTE_LIBPATH $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/Win32 ) ENDIF(CMAKE_CL_64) ELSE() INCLUDE_DIRECTORIES( $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/inc ) IF (CMAKE_CL_64) - SET(CMAKE_NVSDKCOMPUTE_LIBPATH ) + SET(CMAK_NVSDKCOMPUTE_LIBPATH $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/x64 ) ELSE(CMAKE_CL_64) - SET(CMAKE_NVSDKCOMPUTE_LIBPATH $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/Win32 ) + SET(CMAK_NVSDKCOMPUTE_LIBPATH $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/Win32 ) ENDIF(CMAKE_CL_64) ENDIF() @@ -22,8 +22,8 @@ ${BULLET_PHYSICS_SOURCE_DIR}/src ) LINK_LIBRARIES( - BulletMultiThreaded LinearMath - ${CMAKE_NVSDKCOMPUTE_LIBPATH}/OpenCL.lib + LinearMath + ${CMAK_NVSDKCOMPUTE_LIBPATH}/OpenCL.lib ) ADD_EXECUTABLE(AppVectorAdd_NVidia diff --git a/src/BulletCollision/NarrowPhaseCollision/btManifoldPoint.h b/src/BulletCollision/NarrowPhaseCollision/btManifoldPoint.h index 5987699f1..53c9727bc 100644 --- a/src/BulletCollision/NarrowPhaseCollision/btManifoldPoint.h +++ b/src/BulletCollision/NarrowPhaseCollision/btManifoldPoint.h @@ -19,6 +19,15 @@ subject to the following restrictions: #include "LinearMath/btVector3.h" #include "LinearMath/btTransformUtil.h" +// Don't change following order of parameters +ATTRIBUTE_ALIGNED16(struct) PfxConstraintRow { + btScalar mNormal[3]; + btScalar mRhs; + btScalar mJacDiagInv; + btScalar mLowerLimit; + btScalar mUpperLimit; + btScalar mAccumImpulse; +}; @@ -62,8 +71,9 @@ class btManifoldPoint m_contactCFM2(0.f), m_lifeTime(0) { - - + mConstraintRow[0].mAccumImpulse = 0.f; + mConstraintRow[1].mAccumImpulse = 0.f; + mConstraintRow[2].mAccumImpulse = 0.f; } @@ -101,6 +111,11 @@ class btManifoldPoint btVector3 m_lateralFrictionDir1; btVector3 m_lateralFrictionDir2; + + + PfxConstraintRow mConstraintRow[3]; + + btScalar getDistance() const { return m_distance1; diff --git a/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h b/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h index c075c5480..9206129ed 100644 --- a/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h +++ b/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h @@ -48,7 +48,10 @@ enum btContactManifoldTypes ///reduces the cache to 4 points, when more then 4 points are added, using following rules: ///the contact point with deepest penetration is always kept, and it tries to maximuze the area covered by the points ///note that some pairs of objects might have more then one contact manifold. -ATTRIBUTE_ALIGNED16( class) btPersistentManifold : public btTypedObject + + +ATTRIBUTE_ALIGNED128( class) btPersistentManifold : public btTypedObject +//ATTRIBUTE_ALIGNED16( class) btPersistentManifold : public btTypedObject { btManifoldPoint m_pointCache[MANIFOLD_CACHE_SIZE]; @@ -57,6 +60,7 @@ ATTRIBUTE_ALIGNED16( class) btPersistentManifold : public btTypedObject /// void* will allow any rigidbody class void* m_body0; void* m_body1; + int m_cachedPoints; btScalar m_contactBreakingThreshold; @@ -72,6 +76,9 @@ public: BT_DECLARE_ALIGNED_ALLOCATOR(); + int m_companionIdA; + int m_companionIdB; + int m_index1a; btPersistentManifold(); @@ -139,6 +146,10 @@ public: m_pointCache[index] = m_pointCache[lastUsedIndex]; //get rid of duplicated userPersistentData pointer m_pointCache[lastUsedIndex].m_userPersistentData = 0; + m_pointCache[lastUsedIndex].mConstraintRow[0].mAccumImpulse = 0.f; + m_pointCache[lastUsedIndex].mConstraintRow[1].mAccumImpulse = 0.f; + m_pointCache[lastUsedIndex].mConstraintRow[2].mAccumImpulse = 0.f; + m_pointCache[lastUsedIndex].m_appliedImpulse = 0.f; m_pointCache[lastUsedIndex].m_lateralFrictionInitialized = false; m_pointCache[lastUsedIndex].m_appliedImpulseLateral1 = 0.f; @@ -156,10 +167,13 @@ public: #define MAINTAIN_PERSISTENCY 1 #ifdef MAINTAIN_PERSISTENCY int lifeTime = m_pointCache[insertIndex].getLifeTime(); - btScalar appliedImpulse = m_pointCache[insertIndex].m_appliedImpulse; - btScalar appliedLateralImpulse1 = m_pointCache[insertIndex].m_appliedImpulseLateral1; - btScalar appliedLateralImpulse2 = m_pointCache[insertIndex].m_appliedImpulseLateral2; - + btScalar appliedImpulse = m_pointCache[insertIndex].mConstraintRow[0].mAccumImpulse; + btScalar appliedLateralImpulse1 = m_pointCache[insertIndex].mConstraintRow[1].mAccumImpulse; + btScalar appliedLateralImpulse2 = m_pointCache[insertIndex].mConstraintRow[2].mAccumImpulse; + bool isLateralFrictionInitialized = m_pointCache[insertIndex].m_lateralFrictionInitialized; + + + btAssert(lifeTime>=0); void* cache = m_pointCache[insertIndex].m_userPersistentData; @@ -170,6 +184,11 @@ public: m_pointCache[insertIndex].m_appliedImpulseLateral1 = appliedLateralImpulse1; m_pointCache[insertIndex].m_appliedImpulseLateral2 = appliedLateralImpulse2; + m_pointCache[insertIndex].mConstraintRow[0].mAccumImpulse = appliedImpulse; + m_pointCache[insertIndex].mConstraintRow[1].mAccumImpulse = appliedLateralImpulse1; + m_pointCache[insertIndex].mConstraintRow[2].mAccumImpulse = appliedLateralImpulse2; + + m_pointCache[insertIndex].m_lifeTime = lifeTime; #else clearUserCache(m_pointCache[insertIndex]); diff --git a/src/BulletDynamics/ConstraintSolver/btSolverConstraint.h b/src/BulletDynamics/ConstraintSolver/btSolverConstraint.h index 929cf6d3e..0ce25e783 100644 --- a/src/BulletDynamics/ConstraintSolver/btSolverConstraint.h +++ b/src/BulletDynamics/ConstraintSolver/btSolverConstraint.h @@ -59,12 +59,12 @@ ATTRIBUTE_ALIGNED64 (struct) btSolverConstraint union { btRigidBody* m_solverBodyA; - btScalar m_unusedPadding2; + btScalar m_companionIdA; }; union { btRigidBody* m_solverBodyB; - btScalar m_unusedPadding3; + btScalar m_companionIdB; }; union diff --git a/src/BulletDynamics/Dynamics/btRigidBody.cpp b/src/BulletDynamics/Dynamics/btRigidBody.cpp index 714058d5d..5328ed229 100644 --- a/src/BulletDynamics/Dynamics/btRigidBody.cpp +++ b/src/BulletDynamics/Dynamics/btRigidBody.cpp @@ -148,8 +148,8 @@ void btRigidBody::setGravity(const btVector3& acceleration) void btRigidBody::setDamping(btScalar lin_damping, btScalar ang_damping) { - m_linearDamping = GEN_clamped(lin_damping, (btScalar)btScalar(0.0), (btScalar)btScalar(1.0)); - m_angularDamping = GEN_clamped(ang_damping, (btScalar)btScalar(0.0), (btScalar)btScalar(1.0)); + m_linearDamping = btClamped(lin_damping, (btScalar)btScalar(0.0), (btScalar)btScalar(1.0)); + m_angularDamping = btClamped(ang_damping, (btScalar)btScalar(0.0), (btScalar)btScalar(1.0)); } diff --git a/src/BulletMultiThreaded/CMakeLists.txt b/src/BulletMultiThreaded/CMakeLists.txt index 31238b275..6a995d058 100644 --- a/src/BulletMultiThreaded/CMakeLists.txt +++ b/src/BulletMultiThreaded/CMakeLists.txt @@ -1,6 +1,6 @@ INCLUDE_DIRECTORIES( ${BULLET_PHYSICS_SOURCE_DIR}/src - ${BULLET_PHYSICS_SOURCE_DIR}/src/BulletMultiThreaded/vectormath/scalar/cpp + ${VECTOR_MATH_INCLUDE} ) ADD_LIBRARY(BulletMultiThreaded @@ -37,6 +37,11 @@ ADD_LIBRARY(BulletMultiThreaded SpuNarrowPhaseCollisionTask/Box.h SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp SpuNarrowPhaseCollisionTask/boxBoxDistance.h +# SPURS_PEGatherScatterTask/SpuPEGatherScatterTask.cpp +# SPURS_PEGatherScatterTask/SpuPEGatherScatterTask.h +# SpuPEGatherScatterTaskProcess.cpp +# SpuPEGatherScatterTaskProcess.h + SpuNarrowPhaseCollisionTask/SpuContactResult.cpp SpuNarrowPhaseCollisionTask/SpuContactResult.h SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp diff --git a/src/BulletMultiThreaded/HeapManager.h b/src/BulletMultiThreaded/HeapManager.h new file mode 100644 index 000000000..7d0f499c4 --- /dev/null +++ b/src/BulletMultiThreaded/HeapManager.h @@ -0,0 +1,116 @@ +/* + Copyright (C) 2009 Sony Computer Entertainment Inc. + All rights reserved. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. + +*/ + +#ifndef __HEAP_MANAGER_H__ +#define __HEAP_MANAGER_H__ + +#ifdef __SPU__ + #define HEAP_STACK_SIZE 32 +#else + #define HEAP_STACK_SIZE 64 +#endif + +#define MIN_ALLOC_SIZE 16 + + +class HeapManager +{ +private: + ATTRIBUTE_ALIGNED16(unsigned char *mHeap); + ATTRIBUTE_ALIGNED16(unsigned int mHeapBytes); + ATTRIBUTE_ALIGNED16(unsigned char *mPoolStack[HEAP_STACK_SIZE]); + ATTRIBUTE_ALIGNED16(unsigned int mCurStack); + +public: + enum {ALIGN16,ALIGN128}; + + HeapManager(unsigned char *buf,int bytes) + { + mHeap = buf; + mHeapBytes = bytes; + clear(); + } + + ~HeapManager() + { + } + + int getAllocated() + { + return (int)(mPoolStack[mCurStack]-mHeap); + } + + int getRest() + { + return mHeapBytes-getAllocated(); + } + + void *allocate(size_t bytes,int alignment = ALIGN16) + { + if(bytes <= 0) bytes = MIN_ALLOC_SIZE; + btAssert(mCurStack < (HEAP_STACK_SIZE-1)); + + +#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64__) + unsigned long long p = (unsigned long long )mPoolStack[mCurStack]; + if(alignment == ALIGN128) { + p = ((p+127) & 0xffffffffffffff80); + bytes = (bytes+127) & 0xffffffffffffff80; + } + else { + bytes = (bytes+15) & 0xfffffffffffffff0; + } + + btAssert(bytes <=(mHeapBytes-(p-(unsigned long long )mHeap)) ); + +#else + unsigned long p = (unsigned long )mPoolStack[mCurStack]; + if(alignment == ALIGN128) { + p = ((p+127) & 0xffffff80); + bytes = (bytes+127) & 0xffffff80; + } + else { + bytes = (bytes+15) & 0xfffffff0; + } + btAssert(bytes <=(mHeapBytes-(p-(unsigned long)mHeap)) ); +#endif + unsigned char * bla = (unsigned char *)(p + bytes); + mPoolStack[++mCurStack] = bla; + return (void*)p; + } + + void deallocate(void *p) + { + (void) p; + mCurStack--; + } + + void clear() + { + mPoolStack[0] = mHeap; + mCurStack = 0; + } + +// void printStack() +// { +// for(unsigned int i=0;i<=mCurStack;i++) { +// PRINTF("memStack %2d 0x%x\n",i,(uint32_t)mPoolStack[i]); +// } +// } + +}; + +#endif diff --git a/src/BulletMultiThreaded/MiniCL.cpp b/src/BulletMultiThreaded/MiniCL.cpp index 129ee2734..56519013f 100644 --- a/src/BulletMultiThreaded/MiniCL.cpp +++ b/src/BulletMultiThreaded/MiniCL.cpp @@ -25,8 +25,9 @@ subject to the following restrictions: #include "MiniCLTaskScheduler.h" #include "MiniCLTask/MiniCLTask.h" #include "LinearMath/btMinMax.h" +#include -//#define DEBUG_MINICL_KERNELS 1 +#define DEBUG_MINICL_KERNELS 1 static char* spPlatformID = "MiniCL, SCEA"; @@ -316,7 +317,7 @@ static void* localBufMalloc(int size) if((sLocalBufUsed + size16) > LOCAL_BUF_SIZE) { // reset spLocalBufCurr = sLocalMemBuf; - while((int)spLocalBufCurr & 0x0F) spLocalBufCurr++; // align to 16 bytes + while((unsigned long)spLocalBufCurr & 0x0F) spLocalBufCurr++; // align to 16 bytes sLocalBufUsed = 0; } void* ret = spLocalBufCurr; @@ -339,8 +340,8 @@ CL_API_ENTRY cl_int CL_API_CALL clSetKernelArg(cl_kernel clKernel , printf("error: clSetKernelArg arg_index (%d) exceeds %d\n",arg_index,MINI_CL_MAX_ARG); } else { -// if (arg_size>=MINICL_MAX_ARGLENGTH) - if (arg_size != MINICL_MAX_ARGLENGTH) + if (arg_size>MINICL_MAX_ARGLENGTH) + //if (arg_size != MINICL_MAX_ARGLENGTH) { printf("error: clSetKernelArg argdata too large: %d (maximum is %d)\n",arg_size,MINICL_MAX_ARGLENGTH); } @@ -559,12 +560,12 @@ clGetKernelWorkGroupInfo(cl_kernel kernel , size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0 { if((wgi == CL_KERNEL_WORK_GROUP_SIZE) - &&(sz == sizeof(int)) + &&(sz == sizeof(size_t)) &&(ptr != NULL)) { MiniCLKernel* miniCLKernel = (MiniCLKernel*)kernel; MiniCLTaskScheduler* scheduler = miniCLKernel->m_scheduler; - *((int*)ptr) = scheduler->getMaxNumOutstandingTasks(); + *((size_t*)ptr) = scheduler->getMaxNumOutstandingTasks(); return CL_SUCCESS; } else diff --git a/src/BulletMultiThreaded/PlatformDefinitions.h b/src/BulletMultiThreaded/PlatformDefinitions.h index 16362f4bc..68c16e15e 100644 --- a/src/BulletMultiThreaded/PlatformDefinitions.h +++ b/src/BulletMultiThreaded/PlatformDefinitions.h @@ -2,6 +2,18 @@ #define TYPE_DEFINITIONS_H ///This file provides some platform/compiler checks for common definitions +#include "LinearMath/btScalar.h" +#include "LinearMath/btMinMax.h" + +#include +typedef Vectormath::Aos::Vector3 vmVector3; +typedef Vectormath::Aos::Quat vmQuat; +typedef Vectormath::Aos::Matrix3 vmMatrix3; +typedef Vectormath::Aos::Transform3 vmTransform3; +typedef Vectormath::Aos::Point3 vmPoint3; + + + #ifdef _WIN32 @@ -19,9 +31,11 @@ typedef union typedef unsigned char uint8_t; #ifndef __PHYSICS_COMMON_H__ +#ifndef __PFX_COMMON_H__ #ifndef __BT_SKIP_UINT64_H typedef unsigned long int uint64_t; #endif //__BT_SKIP_UINT64_H +#endif //__PFX_COMMON_H__ typedef unsigned int uint32_t; #endif //__PHYSICS_COMMON_H__ typedef unsigned short uint16_t; @@ -54,26 +68,22 @@ typedef union #include #define spu_printf printf #define DWORD unsigned int - typedef union { unsigned long long ull; unsigned int ui[2]; void *p; } addr64; - - -#else - -#include -#define spu_printf printf - #endif // USE_LIBSPE2 - + #endif //__CELLOS_LV2__ #endif +#ifdef __SPU__ +#include +#define printf spu_printf +#endif /* Included here because we need uint*_t typedefs */ #include "PpuAddressSpace.h" diff --git a/src/BulletMultiThreaded/PpuAddressSpace.h b/src/BulletMultiThreaded/PpuAddressSpace.h index f36fdfb3c..833b22bf9 100644 --- a/src/BulletMultiThreaded/PpuAddressSpace.h +++ b/src/BulletMultiThreaded/PpuAddressSpace.h @@ -8,8 +8,9 @@ #pragma warning (disable: 4312) #endif //_WIN32 -#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64__) || defined(USE_ADDR64) -typedef uint64_t ppu_address_t; + +#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64__) +typedef unsigned __int64 ppu_address_t; #else typedef uint32_t ppu_address_t; diff --git a/src/BulletMultiThreaded/SequentialThreadSupport.cpp b/src/BulletMultiThreaded/SequentialThreadSupport.cpp index 4e9c822bb..8cc72418f 100644 --- a/src/BulletMultiThreaded/SequentialThreadSupport.cpp +++ b/src/BulletMultiThreaded/SequentialThreadSupport.cpp @@ -91,3 +91,79 @@ void SequentialThreadSupport::setNumTasks(int numTasks) { printf("SequentialThreadSupport::setNumTasks(%d) is not implemented and has no effect\n",numTasks); } + + + + +class btDummyBarrier : public btBarrier +{ +private: + +public: + btDummyBarrier() + { + } + + virtual ~btDummyBarrier() + { + } + + void sync() + { + } + + virtual void setMaxCount(int n) {} + virtual int getMaxCount() {return 1;} +}; + +class btDummyCriticalSection : public btCriticalSection +{ + +public: + btDummyCriticalSection() + { + } + + virtual ~btDummyCriticalSection() + { + } + + unsigned int getSharedParam(int i) + { + btAssert(i>=0&&i<31); + return mCommonBuff[i+1]; + } + + void setSharedParam(int i,unsigned int p) + { + btAssert(i>=0&&i<31); + mCommonBuff[i+1] = p; + } + + void lock() + { + mCommonBuff[0] = 1; + } + + void unlock() + { + mCommonBuff[0] = 0; + } +}; + + + + +btBarrier* SequentialThreadSupport::createBarrier() +{ + return new btDummyBarrier(); +} + +btCriticalSection* SequentialThreadSupport::createCriticalSection() +{ + return new btDummyCriticalSection(); + +} + + + diff --git a/src/BulletMultiThreaded/SequentialThreadSupport.h b/src/BulletMultiThreaded/SequentialThreadSupport.h index 514e54ee9..2b65a4842 100644 --- a/src/BulletMultiThreaded/SequentialThreadSupport.h +++ b/src/BulletMultiThreaded/SequentialThreadSupport.h @@ -85,10 +85,10 @@ public: { return 1; } - virtual btBarrier* createBarrier() { return 0;} - - virtual btCriticalSection* createCriticalSection() { return 0;}; + virtual btBarrier* createBarrier(); + virtual btCriticalSection* createCriticalSection(); + }; diff --git a/src/BulletMultiThreaded/SpuFakeDma.cpp b/src/BulletMultiThreaded/SpuFakeDma.cpp index 62cef3961..61e1a49d6 100644 --- a/src/BulletMultiThreaded/SpuFakeDma.cpp +++ b/src/BulletMultiThreaded/SpuFakeDma.cpp @@ -174,6 +174,9 @@ int cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, { char* mainMem = (char*)ea; char* localStore = (char*)ls; + +// printf("mainMem=%x, localStore=%x",mainMem,localStore); + #ifdef USE_MEMCPY memcpy(localStore,mainMem,size); #else @@ -182,6 +185,7 @@ int cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, localStore[i] = mainMem[i]; } #endif //#ifdef USE_MEMCPY +// printf(" finished\n"); return 0; } diff --git a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h index 9bc2ebf51..e07c731ef 100644 --- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h +++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h @@ -25,15 +25,11 @@ subject to the following restrictions: #include ///only use a system-wide vectormath_aos.h on CELLOS_LV2 or if USE_SYSTEM_VECTORMATH -#if defined(__CELLOS_LV2__) || defined (USE_SYSTEM_VECTORMATH) -#include -#else -#include "BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h" -#endif +#include "vectormath_aos.h" +#include "../PlatformDefinitions.h" -using namespace Vectormath::Aos; enum FeatureType { F, E, V }; @@ -44,21 +40,21 @@ enum FeatureType { F, E, V }; class Box { public: - Vector3 half; + vmVector3 mHalf; inline Box() {} - inline Box(PE_REF(Vector3) half_); + inline Box(PE_REF(vmVector3) half_); inline Box(float hx, float hy, float hz); - inline void Set(PE_REF(Vector3) half_); + inline void Set(PE_REF(vmVector3) half_); inline void Set(float hx, float hy, float hz); - inline Vector3 GetAABB(const Matrix3& rotation) const; + inline vmVector3 GetAABB(const vmMatrix3& rotation) const; }; inline -Box::Box(PE_REF(Vector3) half_) +Box::Box(PE_REF(vmVector3) half_) { Set(half_); } @@ -71,23 +67,23 @@ Box::Box(float hx, float hy, float hz) inline void -Box::Set(PE_REF(Vector3) half_) +Box::Set(PE_REF(vmVector3) half_) { - half = half_; + mHalf = half_; } inline void Box::Set(float hx, float hy, float hz) { - half = Vector3(hx, hy, hz); + mHalf = vmVector3(hx, hy, hz); } inline -Vector3 -Box::GetAABB(const Matrix3& rotation) const +vmVector3 +Box::GetAABB(const vmMatrix3& rotation) const { - return absPerElem(rotation) * half; + return absPerElem(rotation) * mHalf; } //------------------------------------------------------------------------------------------------- @@ -100,7 +96,7 @@ class BoxPoint public: BoxPoint() : localPoint(0.0f) {} - Point3 localPoint; + vmPoint3 localPoint; FeatureType featureType; int featureIdx; diff --git a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp index 8e540d929..a71095380 100644 --- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp +++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp @@ -22,7 +22,7 @@ subject to the following restrictions: #include #define spu_printf printf #endif -#endif DEBUG_SPU_COLLISION_DETECTION +#endif //DEBUG_SPU_COLLISION_DETECTION SpuContactResult::SpuContactResult() { diff --git a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp index 80a98ffc7..dfab98028 100644 --- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp +++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp @@ -52,7 +52,7 @@ subject to the following restrictions: #ifdef __SPU__ ///Software caching from the IBM Cell SDK, it reduces 25% SPU time for our test cases #ifndef USE_LIBSPE2 -#define USE_SOFTWARE_CACHE 1 +//#define USE_SOFTWARE_CACHE 1 #endif #endif //__SPU__ @@ -1216,7 +1216,7 @@ void processCollisionTask(void* userPtr, void* lsMemPtr) #endif ) { -//#define USE_PE_BOX_BOX 1 +#define USE_PE_BOX_BOX 1 #ifdef USE_PE_BOX_BOX { @@ -1225,38 +1225,64 @@ void processCollisionTask(void* userPtr, void* lsMemPtr) btScalar margin1 = lsMem.getlocalCollisionAlgorithm()->getCollisionMargin1(); btVector3 shapeDim0 = lsMem.getlocalCollisionAlgorithm()->getShapeDimensions0()+btVector3(margin0,margin0,margin0); btVector3 shapeDim1 = lsMem.getlocalCollisionAlgorithm()->getShapeDimensions1()+btVector3(margin1,margin1,margin1); +/* + //Box boxA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ()); + vmVector3 vmPos0 = getVmVector3(collisionPairInput.m_worldTransform0.getOrigin()); + vmVector3 vmPos1 = getVmVector3(collisionPairInput.m_worldTransform1.getOrigin()); + vmMatrix3 vmMatrix0 = getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis()); + vmMatrix3 vmMatrix1 = getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis()); - Box boxA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ()); - Vector3 vmPos0 = getVmVector3(collisionPairInput.m_worldTransform0.getOrigin()); - Vector3 vmPos1 = getVmVector3(collisionPairInput.m_worldTransform1.getOrigin()); - Matrix3 vmMatrix0 = getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis()); - Matrix3 vmMatrix1 = getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis()); - - Transform3 transformA(vmMatrix0,vmPos0); + vmTransform3 transformA(vmMatrix0,vmPos0); Box boxB(shapeDim1.getX(),shapeDim1.getY(),shapeDim1.getZ()); - Transform3 transformB(vmMatrix1,vmPos1); + vmTransform3 transformB(vmMatrix1,vmPos1); BoxPoint resultClosestBoxPointA; BoxPoint resultClosestBoxPointB; - Vector3 resultNormal; + vmVector3 resultNormal; + */ + #ifdef USE_SEPDISTANCE_UTIL float distanceThreshold = FLT_MAX #else - float distanceThreshold = 0.f; + //float distanceThreshold = 0.f; #endif - distance = boxBoxDistance(resultNormal,resultClosestBoxPointA,resultClosestBoxPointB, boxA, transformA, boxB,transformB,distanceThreshold); + vmVector3 n; + Box boxA; + vmVector3 hA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ()); + vmVector3 hB(shapeDim1.getX(),shapeDim1.getY(),shapeDim1.getZ()); + boxA.mHalf= hA; + vmTransform3 trA; + trA.setTranslation(getVmVector3(collisionPairInput.m_worldTransform0.getOrigin())); + trA.setUpper3x3(getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis())); + Box boxB; + boxB.mHalf = hB; + vmTransform3 trB; + trB.setTranslation(getVmVector3(collisionPairInput.m_worldTransform1.getOrigin())); + trB.setUpper3x3(getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis())); - normalInB = -getBtVector3(resultNormal); + float distanceThreshold = spuManifold->getContactBreakingThreshold();//0.001f; - if(distance < spuManifold->getContactBreakingThreshold()) + + BoxPoint ptA,ptB; + float dist = boxBoxDistance(n, ptA, ptB, + boxA, trA, boxB, trB, + distanceThreshold ); + + +// float distance = boxBoxDistance(resultNormal,resultClosestBoxPointA,resultClosestBoxPointB, boxA, transformA, boxB,transformB,distanceThreshold); + + normalInB = -getBtVector3(n);//resultNormal); + + //if(dist < distanceThreshold)//spuManifold->getContactBreakingThreshold()) + if(dist < spuManifold->getContactBreakingThreshold()) { - btVector3 pointOnB = collisionPairInput.m_worldTransform1(getBtVector3(resultClosestBoxPointB.localPoint)); + btVector3 pointOnB = collisionPairInput.m_worldTransform1(getBtVector3(ptB.localPoint)); spuContacts.addContactPoint( normalInB, pointOnB, - distance); + dist); } } #else diff --git a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp index 30642a392..5e1202c01 100644 --- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp +++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp @@ -15,7 +15,11 @@ subject to the following restrictions: */ -#include "Box.h" +//#include "PfxContactBoxBox.h" + +#include +#include "../PlatformDefinitions.h" +#include "boxBoxDistance.h" static inline float sqr( float a ) { @@ -114,18 +118,18 @@ VertexBFaceATest( bool & inVoronoi, float & t0, float & t1, - const Vector3 & hA, - PE_REF(Vector3) faceOffsetAB, - PE_REF(Vector3) faceOffsetBA, - const Matrix3 & matrixAB, - const Matrix3 & matrixBA, - PE_REF(Vector3) signsB, - PE_REF(Vector3) scalesB ) + const vmVector3 & hA, + PE_REF(vmVector3) faceOffsetAB, + PE_REF(vmVector3) faceOffsetBA, + const vmMatrix3 & matrixAB, + const vmMatrix3 & matrixBA, + PE_REF(vmVector3) signsB, + PE_REF(vmVector3) scalesB ) { // compute a corner of box B in A's coordinate system - Vector3 corner = - Vector3( faceOffsetAB + matrixAB.getCol0() * scalesB.getX() + matrixAB.getCol1() * scalesB.getY() ); + vmVector3 corner = + vmVector3( faceOffsetAB + matrixAB.getCol0() * scalesB.getX() + matrixAB.getCol1() * scalesB.getY() ); // compute the parameters of the point on A, closest to this corner @@ -144,8 +148,8 @@ VertexBFaceATest( // do the Voronoi test: already know the point on B is in the Voronoi region of the // point on A, check the reverse. - Vector3 facePointB = - Vector3( mulPerElem( faceOffsetBA + matrixBA.getCol0() * t0 + matrixBA.getCol1() * t1 - scalesB, signsB ) ); + vmVector3 facePointB = + vmVector3( mulPerElem( faceOffsetBA + matrixBA.getCol0() * t0 + matrixBA.getCol1() * t1 - scalesB, signsB ) ); inVoronoi = ( ( facePointB[0] >= voronoiTol * facePointB[2] ) && ( facePointB[1] >= voronoiTol * facePointB[0] ) && @@ -169,17 +173,17 @@ void VertexBFaceATests( bool & done, float & minDistSqr, - Point3 & localPointA, - Point3 & localPointB, + vmPoint3 & localPointA, + vmPoint3 & localPointB, FeatureType & featureA, FeatureType & featureB, - const Vector3 & hA, - PE_REF(Vector3) faceOffsetAB, - PE_REF(Vector3) faceOffsetBA, - const Matrix3 & matrixAB, - const Matrix3 & matrixBA, - PE_REF(Vector3) signsB, - PE_REF(Vector3) scalesB, + const vmVector3 & hA, + PE_REF(vmVector3) faceOffsetAB, + PE_REF(vmVector3) faceOffsetBA, + const vmMatrix3 & matrixAB, + const vmMatrix3 & matrixBA, + PE_REF(vmVector3) signsB, + PE_REF(vmVector3) scalesB, bool first ) { @@ -247,16 +251,16 @@ VertexAFaceBTest( bool & inVoronoi, float & t0, float & t1, - const Vector3 & hB, - PE_REF(Vector3) faceOffsetAB, - PE_REF(Vector3) faceOffsetBA, - const Matrix3 & matrixAB, - const Matrix3 & matrixBA, - PE_REF(Vector3) signsA, - PE_REF(Vector3) scalesA ) + const vmVector3 & hB, + PE_REF(vmVector3) faceOffsetAB, + PE_REF(vmVector3) faceOffsetBA, + const vmMatrix3 & matrixAB, + const vmMatrix3 & matrixBA, + PE_REF(vmVector3) signsA, + PE_REF(vmVector3) scalesA ) { - Vector3 corner = - Vector3( faceOffsetBA + matrixBA.getCol0() * scalesA.getX() + matrixBA.getCol1() * scalesA.getY() ); + vmVector3 corner = + vmVector3( faceOffsetBA + matrixBA.getCol0() * scalesA.getX() + matrixBA.getCol1() * scalesA.getY() ); t0 = corner[0]; t1 = corner[1]; @@ -270,8 +274,8 @@ VertexAFaceBTest( else if ( t1 < -hB[1] ) t1 = -hB[1]; - Vector3 facePointA = - Vector3( mulPerElem( faceOffsetAB + matrixAB.getCol0() * t0 + matrixAB.getCol1() * t1 - scalesA, signsA ) ); + vmVector3 facePointA = + vmVector3( mulPerElem( faceOffsetAB + matrixAB.getCol0() * t0 + matrixAB.getCol1() * t1 - scalesA, signsA ) ); inVoronoi = ( ( facePointA[0] >= voronoiTol * facePointA[2] ) && ( facePointA[1] >= voronoiTol * facePointA[0] ) && @@ -295,17 +299,17 @@ void VertexAFaceBTests( bool & done, float & minDistSqr, - Point3 & localPointA, - Point3 & localPointB, + vmPoint3 & localPointA, + vmPoint3 & localPointB, FeatureType & featureA, FeatureType & featureB, - const Vector3 & hB, - PE_REF(Vector3) faceOffsetAB, - PE_REF(Vector3) faceOffsetBA, - const Matrix3 & matrixAB, - const Matrix3 & matrixBA, - PE_REF(Vector3) signsA, - PE_REF(Vector3) scalesA, + const vmVector3 & hB, + PE_REF(vmVector3) faceOffsetAB, + PE_REF(vmVector3) faceOffsetBA, + const vmMatrix3 & matrixAB, + const vmMatrix3 & matrixBA, + PE_REF(vmVector3) signsA, + PE_REF(vmVector3) scalesA, bool first ) { float t0, t1; @@ -363,7 +367,7 @@ VertexAFaceBTests( } //------------------------------------------------------------------------------------------------- -// EdgeEdgeTest: +// CustomEdgeEdgeTest: // // tests whether a pair of edges are the closest features // @@ -374,10 +378,10 @@ VertexAFaceBTests( // the dimension of the face normal is 2 //------------------------------------------------------------------------------------------------- -#define EdgeEdgeTest( ac, ac_letter, ad, ad_letter, bc, bc_letter, bd, bd_letter ) \ +#define CustomEdgeEdgeTest( ac, ac_letter, ad, ad_letter, bc, bc_letter, bd, bd_letter ) \ { \ - Vector3 edgeOffsetAB; \ - Vector3 edgeOffsetBA; \ + vmVector3 edgeOffsetAB; \ + vmVector3 edgeOffsetBA; \ \ edgeOffsetAB = faceOffsetAB + matrixAB.getCol##bc() * scalesB.get##bc_letter(); \ edgeOffsetAB.set##ac_letter( edgeOffsetAB.get##ac_letter() - scalesA.get##ac_letter() ); \ @@ -421,8 +425,8 @@ VertexAFaceBTests( else if ( tA > hA[ad] ) tA = hA[ad]; \ } \ \ - Vector3 edgeOffAB = Vector3( mulPerElem( edgeOffsetAB + matrixAB.getCol##bd() * tB, signsA ) );\ - Vector3 edgeOffBA = Vector3( mulPerElem( edgeOffsetBA + matrixBA.getCol##ad() * tA, signsB ) );\ + vmVector3 edgeOffAB = vmVector3( mulPerElem( edgeOffsetAB + matrixAB.getCol##bd() * tB, signsA ) );\ + vmVector3 edgeOffBA = vmVector3( mulPerElem( edgeOffsetBA + matrixBA.getCol##ad() * tA, signsB ) );\ \ inVoronoi = ( edgeOffAB[ac] >= voronoiTol * edgeOffAB[2] ) && \ ( edgeOffAB[2] >= voronoiTol * edgeOffAB[ac] ) && \ @@ -436,79 +440,79 @@ VertexAFaceBTests( } float -EdgeEdgeTest_0101( +CustomEdgeEdgeTest_0101( bool & inVoronoi, float & tA, float & tB, - const Vector3 & hA, - const Vector3 & hB, - PE_REF(Vector3) faceOffsetAB, - PE_REF(Vector3) faceOffsetBA, - const Matrix3 & matrixAB, - const Matrix3 & matrixBA, - PE_REF(Vector3) signsA, - PE_REF(Vector3) signsB, - PE_REF(Vector3) scalesA, - PE_REF(Vector3) scalesB ) + const vmVector3 & hA, + const vmVector3 & hB, + PE_REF(vmVector3) faceOffsetAB, + PE_REF(vmVector3) faceOffsetBA, + const vmMatrix3 & matrixAB, + const vmMatrix3 & matrixBA, + PE_REF(vmVector3) signsA, + PE_REF(vmVector3) signsB, + PE_REF(vmVector3) scalesA, + PE_REF(vmVector3) scalesB ) { - EdgeEdgeTest( 0, X, 1, Y, 0, X, 1, Y ); + CustomEdgeEdgeTest( 0, X, 1, Y, 0, X, 1, Y ); } float -EdgeEdgeTest_0110( +CustomEdgeEdgeTest_0110( bool & inVoronoi, float & tA, float & tB, - const Vector3 & hA, - const Vector3 & hB, - PE_REF(Vector3) faceOffsetAB, - PE_REF(Vector3) faceOffsetBA, - const Matrix3 & matrixAB, - const Matrix3 & matrixBA, - PE_REF(Vector3) signsA, - PE_REF(Vector3) signsB, - PE_REF(Vector3) scalesA, - PE_REF(Vector3) scalesB ) + const vmVector3 & hA, + const vmVector3 & hB, + PE_REF(vmVector3) faceOffsetAB, + PE_REF(vmVector3) faceOffsetBA, + const vmMatrix3 & matrixAB, + const vmMatrix3 & matrixBA, + PE_REF(vmVector3) signsA, + PE_REF(vmVector3) signsB, + PE_REF(vmVector3) scalesA, + PE_REF(vmVector3) scalesB ) { - EdgeEdgeTest( 0, X, 1, Y, 1, Y, 0, X ); + CustomEdgeEdgeTest( 0, X, 1, Y, 1, Y, 0, X ); } float -EdgeEdgeTest_1001( +CustomEdgeEdgeTest_1001( bool & inVoronoi, float & tA, float & tB, - const Vector3 & hA, - const Vector3 & hB, - PE_REF(Vector3) faceOffsetAB, - PE_REF(Vector3) faceOffsetBA, - const Matrix3 & matrixAB, - const Matrix3 & matrixBA, - PE_REF(Vector3) signsA, - PE_REF(Vector3) signsB, - PE_REF(Vector3) scalesA, - PE_REF(Vector3) scalesB ) + const vmVector3 & hA, + const vmVector3 & hB, + PE_REF(vmVector3) faceOffsetAB, + PE_REF(vmVector3) faceOffsetBA, + const vmMatrix3 & matrixAB, + const vmMatrix3 & matrixBA, + PE_REF(vmVector3) signsA, + PE_REF(vmVector3) signsB, + PE_REF(vmVector3) scalesA, + PE_REF(vmVector3) scalesB ) { - EdgeEdgeTest( 1, Y, 0, X, 0, X, 1, Y ); + CustomEdgeEdgeTest( 1, Y, 0, X, 0, X, 1, Y ); } float -EdgeEdgeTest_1010( +CustomEdgeEdgeTest_1010( bool & inVoronoi, float & tA, float & tB, - const Vector3 & hA, - const Vector3 & hB, - PE_REF(Vector3) faceOffsetAB, - PE_REF(Vector3) faceOffsetBA, - const Matrix3 & matrixAB, - const Matrix3 & matrixBA, - PE_REF(Vector3) signsA, - PE_REF(Vector3) signsB, - PE_REF(Vector3) scalesA, - PE_REF(Vector3) scalesB ) + const vmVector3 & hA, + const vmVector3 & hB, + PE_REF(vmVector3) faceOffsetAB, + PE_REF(vmVector3) faceOffsetBA, + const vmMatrix3 & matrixAB, + const vmMatrix3 & matrixBA, + PE_REF(vmVector3) signsA, + PE_REF(vmVector3) signsB, + PE_REF(vmVector3) scalesA, + PE_REF(vmVector3) scalesB ) { - EdgeEdgeTest( 1, Y, 0, X, 1, Y, 0, X ); + CustomEdgeEdgeTest( 1, Y, 0, X, 1, Y, 0, X ); } #define EdgeEdge_SetNewMin( ac_letter, ad_letter, bc_letter, bd_letter ) \ @@ -528,22 +532,22 @@ void EdgeEdgeTests( bool & done, float & minDistSqr, - Point3 & localPointA, - Point3 & localPointB, + vmPoint3 & localPointA, + vmPoint3 & localPointB, int & otherFaceDimA, int & otherFaceDimB, FeatureType & featureA, FeatureType & featureB, - const Vector3 & hA, - const Vector3 & hB, - PE_REF(Vector3) faceOffsetAB, - PE_REF(Vector3) faceOffsetBA, - const Matrix3 & matrixAB, - const Matrix3 & matrixBA, - PE_REF(Vector3) signsA, - PE_REF(Vector3) signsB, - PE_REF(Vector3) scalesA, - PE_REF(Vector3) scalesB, + const vmVector3 & hA, + const vmVector3 & hB, + PE_REF(vmVector3) faceOffsetAB, + PE_REF(vmVector3) faceOffsetBA, + const vmMatrix3 & matrixAB, + const vmMatrix3 & matrixBA, + PE_REF(vmVector3) signsA, + PE_REF(vmVector3) signsB, + PE_REF(vmVector3) scalesA, + PE_REF(vmVector3) scalesB, bool first ) { @@ -555,7 +559,7 @@ EdgeEdgeTests( testOtherFaceDimA = 0; testOtherFaceDimB = 0; - distSqr = EdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( first ) { @@ -572,7 +576,7 @@ EdgeEdgeTests( signsA.setX( -signsA.getX() ); scalesA.setX( -scalesA.getX() ); - distSqr = EdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -585,7 +589,7 @@ EdgeEdgeTests( signsB.setX( -signsB.getX() ); scalesB.setX( -scalesB.getX() ); - distSqr = EdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -598,7 +602,7 @@ EdgeEdgeTests( signsA.setX( -signsA.getX() ); scalesA.setX( -scalesA.getX() ); - distSqr = EdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -613,7 +617,7 @@ EdgeEdgeTests( signsB.setX( -signsB.getX() ); scalesB.setX( -scalesB.getX() ); - distSqr = EdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -626,7 +630,7 @@ EdgeEdgeTests( signsA.setY( -signsA.getY() ); scalesA.setY( -scalesA.getY() ); - distSqr = EdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -639,7 +643,7 @@ EdgeEdgeTests( signsB.setX( -signsB.getX() ); scalesB.setX( -scalesB.getX() ); - distSqr = EdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -652,7 +656,7 @@ EdgeEdgeTests( signsA.setY( -signsA.getY() ); scalesA.setY( -scalesA.getY() ); - distSqr = EdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -667,7 +671,7 @@ EdgeEdgeTests( signsB.setX( -signsB.getX() ); scalesB.setX( -scalesB.getX() ); - distSqr = EdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -680,7 +684,7 @@ EdgeEdgeTests( signsA.setX( -signsA.getX() ); scalesA.setX( -scalesA.getX() ); - distSqr = EdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -693,7 +697,7 @@ EdgeEdgeTests( signsB.setY( -signsB.getY() ); scalesB.setY( -scalesB.getY() ); - distSqr = EdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -706,7 +710,7 @@ EdgeEdgeTests( signsA.setX( -signsA.getX() ); scalesA.setX( -scalesA.getX() ); - distSqr = EdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -721,7 +725,7 @@ EdgeEdgeTests( signsB.setY( -signsB.getY() ); scalesB.setY( -scalesB.getY() ); - distSqr = EdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -734,7 +738,7 @@ EdgeEdgeTests( signsA.setY( -signsA.getY() ); scalesA.setY( -scalesA.getY() ); - distSqr = EdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -747,7 +751,7 @@ EdgeEdgeTests( signsB.setY( -signsB.getY() ); scalesB.setY( -scalesB.getY() ); - distSqr = EdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -760,7 +764,7 @@ EdgeEdgeTests( signsA.setY( -signsA.getY() ); scalesA.setY( -scalesA.getY() ); - distSqr = EdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, + distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA, matrixAB, matrixBA, signsA, signsB, scalesA, scalesB ); if ( distSqr < minDistSqr ) { @@ -768,27 +772,25 @@ EdgeEdgeTests( } } + float -boxBoxDistance( - Vector3& normal, - BoxPoint& boxPointA, - BoxPoint& boxPointB, - PE_REF(Box) boxA, const Transform3& transformA, - PE_REF(Box) boxB, const Transform3& transformB, - float distanceThreshold ) +boxBoxDistance(vmVector3& normal, BoxPoint& boxPointA, BoxPoint& boxPointB, + PE_REF(Box) boxA, const vmTransform3 & transformA, PE_REF(Box) boxB, + const vmTransform3 & transformB, + float distanceThreshold) { - Matrix3 identity; - identity = Matrix3::identity(); - Vector3 ident[3]; + vmMatrix3 identity; + identity = vmMatrix3::identity(); + vmVector3 ident[3]; ident[0] = identity.getCol0(); ident[1] = identity.getCol1(); ident[2] = identity.getCol2(); // get relative transformations - Transform3 transformAB, transformBA; - Matrix3 matrixAB, matrixBA; - Vector3 offsetAB, offsetBA; + vmTransform3 transformAB, transformBA; + vmMatrix3 matrixAB, matrixBA; + vmVector3 offsetAB, offsetBA; transformAB = orthoInverse(transformA) * transformB; transformBA = orthoInverse(transformAB); @@ -798,25 +800,25 @@ boxBoxDistance( matrixBA = transformBA.getUpper3x3(); offsetBA = transformBA.getTranslation(); - Matrix3 absMatrixAB = absPerElem(matrixAB); - Matrix3 absMatrixBA = absPerElem(matrixBA); + vmMatrix3 absMatrixAB = absPerElem(matrixAB); + vmMatrix3 absMatrixBA = absPerElem(matrixBA); // find separating axis with largest gap between projections BoxSepAxisType axisType; - Vector3 axisA(0.0f), axisB(0.0f); + vmVector3 axisA(0.0f), axisB(0.0f); float gap, maxGap; int faceDimA = 0, faceDimB = 0, edgeDimA = 0, edgeDimB = 0; // face axes - Vector3 gapsA = absPerElem(offsetAB) - boxA.half - absMatrixAB * boxB.half; + vmVector3 gapsA = absPerElem(offsetAB) - boxA.mHalf - absMatrixAB * boxB.mHalf; AaxisTest(0,X,true); AaxisTest(1,Y,false); AaxisTest(2,Z,false); - Vector3 gapsB = absPerElem(offsetBA) - boxB.half - absMatrixBA * boxA.half; + vmVector3 gapsB = absPerElem(offsetBA) - boxB.mHalf - absMatrixBA * boxA.mHalf; BaxisTest(0,X); BaxisTest(1,Y); @@ -825,10 +827,10 @@ boxBoxDistance( // cross product axes // ŠOÏ‚ª‚O‚̂Ƃ«‚Ì‘Îô - absMatrixAB += Matrix3(1.0e-5f); - absMatrixBA += Matrix3(1.0e-5f); + absMatrixAB += vmMatrix3(1.0e-5f); + absMatrixBA += vmMatrix3(1.0e-5f); - Matrix3 lsqrs, projOffset, projAhalf, projBhalf; + vmMatrix3 lsqrs, projOffset, projAhalf, projBhalf; lsqrs.setCol0( mulPerElem( matrixBA.getCol2(), matrixBA.getCol2() ) + mulPerElem( matrixBA.getCol1(), matrixBA.getCol1() ) ); @@ -841,15 +843,15 @@ boxBoxDistance( projOffset.setCol1(matrixBA.getCol2() * offsetAB.getX() - matrixBA.getCol0() * offsetAB.getZ()); projOffset.setCol2(matrixBA.getCol0() * offsetAB.getY() - matrixBA.getCol1() * offsetAB.getX()); - projAhalf.setCol0(absMatrixBA.getCol1() * boxA.half.getZ() + absMatrixBA.getCol2() * boxA.half.getY()); - projAhalf.setCol1(absMatrixBA.getCol2() * boxA.half.getX() + absMatrixBA.getCol0() * boxA.half.getZ()); - projAhalf.setCol2(absMatrixBA.getCol0() * boxA.half.getY() + absMatrixBA.getCol1() * boxA.half.getX()); + projAhalf.setCol0(absMatrixBA.getCol1() * boxA.mHalf.getZ() + absMatrixBA.getCol2() * boxA.mHalf.getY()); + projAhalf.setCol1(absMatrixBA.getCol2() * boxA.mHalf.getX() + absMatrixBA.getCol0() * boxA.mHalf.getZ()); + projAhalf.setCol2(absMatrixBA.getCol0() * boxA.mHalf.getY() + absMatrixBA.getCol1() * boxA.mHalf.getX()); - projBhalf.setCol0(absMatrixAB.getCol1() * boxB.half.getZ() + absMatrixAB.getCol2() * boxB.half.getY()); - projBhalf.setCol1(absMatrixAB.getCol2() * boxB.half.getX() + absMatrixAB.getCol0() * boxB.half.getZ()); - projBhalf.setCol2(absMatrixAB.getCol0() * boxB.half.getY() + absMatrixAB.getCol1() * boxB.half.getX()); + projBhalf.setCol0(absMatrixAB.getCol1() * boxB.mHalf.getZ() + absMatrixAB.getCol2() * boxB.mHalf.getY()); + projBhalf.setCol1(absMatrixAB.getCol2() * boxB.mHalf.getX() + absMatrixAB.getCol0() * boxB.mHalf.getZ()); + projBhalf.setCol2(absMatrixAB.getCol0() * boxB.mHalf.getY() + absMatrixAB.getCol1() * boxB.mHalf.getX()); - Matrix3 gapsAxB = absPerElem(projOffset) - projAhalf - transpose(projBhalf); + vmMatrix3 gapsAxB = absPerElem(projOffset) - projAhalf - transpose(projBhalf); CrossAxisTest(0,0,X); CrossAxisTest(0,1,Y); @@ -872,7 +874,7 @@ boxBoxDistance( axisA = -axisA; axisB = matrixBA * -axisA; - Vector3 absAxisB = Vector3(absPerElem(axisB)); + vmVector3 absAxisB = vmVector3(absPerElem(axisB)); if ( ( absAxisB[0] > absAxisB[1] ) && ( absAxisB[0] > absAxisB[2] ) ) faceDimB = 0; @@ -885,7 +887,7 @@ boxBoxDistance( axisB = -axisB; axisA = matrixAB * -axisB; - Vector3 absAxisA = Vector3(absPerElem(axisA)); + vmVector3 absAxisA = vmVector3(absPerElem(axisA)); if ( ( absAxisA[0] > absAxisA[1] ) && ( absAxisA[0] > absAxisA[2] ) ) faceDimA = 0; @@ -900,8 +902,8 @@ boxBoxDistance( axisA = -axisA; axisB = matrixBA * -axisA; - Vector3 absAxisA = Vector3(absPerElem(axisA)); - Vector3 absAxisB = Vector3(absPerElem(axisB)); + vmVector3 absAxisA = vmVector3(absPerElem(axisA)); + vmVector3 absAxisB = vmVector3(absPerElem(axisB)); dimA[1] = edgeDimA; dimB[1] = edgeDimB; @@ -966,7 +968,7 @@ boxBoxDistance( dimB[1] = (faceDimB+2)%3; } - Matrix3 aperm_col, bperm_col; + vmMatrix3 aperm_col, bperm_col; aperm_col.setCol0(ident[dimA[0]]); aperm_col.setCol1(ident[dimA[1]]); @@ -976,32 +978,32 @@ boxBoxDistance( bperm_col.setCol1(ident[dimB[1]]); bperm_col.setCol2(ident[dimB[2]]); - Matrix3 aperm_row, bperm_row; + vmMatrix3 aperm_row, bperm_row; aperm_row = transpose(aperm_col); bperm_row = transpose(bperm_col); // permute all box parameters to be in the face coordinate systems - Matrix3 matrixAB_perm = aperm_row * matrixAB * bperm_col; - Matrix3 matrixBA_perm = transpose(matrixAB_perm); + vmMatrix3 matrixAB_perm = aperm_row * matrixAB * bperm_col; + vmMatrix3 matrixBA_perm = transpose(matrixAB_perm); - Vector3 offsetAB_perm, offsetBA_perm; + vmVector3 offsetAB_perm, offsetBA_perm; offsetAB_perm = aperm_row * offsetAB; offsetBA_perm = bperm_row * offsetBA; - Vector3 halfA_perm, halfB_perm; + vmVector3 halfA_perm, halfB_perm; - halfA_perm = aperm_row * boxA.half; - halfB_perm = bperm_row * boxB.half; + halfA_perm = aperm_row * boxA.mHalf; + halfB_perm = bperm_row * boxB.mHalf; // compute the vector between the centers of each face, in each face's coordinate frame - Vector3 signsA_perm, signsB_perm, scalesA_perm, scalesB_perm, faceOffsetAB_perm, faceOffsetBA_perm; + vmVector3 signsA_perm, signsB_perm, scalesA_perm, scalesB_perm, faceOffsetAB_perm, faceOffsetBA_perm; - signsA_perm = copySignPerElem(Vector3(1.0f),aperm_row * axisA); - signsB_perm = copySignPerElem(Vector3(1.0f),bperm_row * axisB); + signsA_perm = copySignPerElem(vmVector3(1.0f),aperm_row * axisA); + signsB_perm = copySignPerElem(vmVector3(1.0f),bperm_row * axisB); scalesA_perm = mulPerElem( signsA_perm, halfA_perm ); scalesB_perm = mulPerElem( signsB_perm, halfB_perm ); @@ -1031,11 +1033,11 @@ boxBoxDistance( // if for some reason no case passes the Voronoi test, the features with the minimum // distance are returned. - Point3 localPointA_perm, localPointB_perm; + vmPoint3 localPointA_perm, localPointB_perm; float minDistSqr; bool done; - Vector3 hA_perm( halfA_perm ), hB_perm( halfB_perm ); + vmVector3 hA_perm( halfA_perm ), hB_perm( halfB_perm ); localPointA_perm.setZ( scalesA_perm.getZ() ); localPointB_perm.setZ( scalesB_perm.getZ() ); @@ -1109,9 +1111,11 @@ boxBoxDistance( // convert local points from face-local to box-local coordinate system - boxPointA.localPoint = Point3( aperm_col * Vector3( localPointA_perm ) ); - boxPointB.localPoint = Point3( bperm_col * Vector3( localPointB_perm ) ); + + boxPointA.localPoint = vmPoint3( aperm_col * vmVector3( localPointA_perm )) ; + boxPointB.localPoint = vmPoint3( bperm_col * vmVector3( localPointB_perm )) ; +#if 0 // find which features of the boxes are involved. // the only feature pairs which occur in this function are VF, FV, and EE, even though the // closest points might actually lie on sub-features, as in a VF contact might be used for @@ -1144,6 +1148,7 @@ boxBoxDistance( } else { boxPointB.setVertexFeature( sB[0], sB[1], sB[2] ); } +#endif normal = transformA * axisA; diff --git a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h index c58e257c0..0d4957dea 100644 --- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h +++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h @@ -21,7 +21,6 @@ subject to the following restrictions: #include "Box.h" -using namespace Vectormath::Aos; //--------------------------------------------------------------------------- // boxBoxDistance: @@ -38,7 +37,7 @@ using namespace Vectormath::Aos; // positive or negative distance between two boxes. // // args: -// Vector3& normal: set to a unit contact normal pointing from box A to box B. +// vmVector3& normal: set to a unit contact normal pointing from box A to box B. // // BoxPoint& boxPointA, BoxPoint& boxPointB: // set to a closest point or point of penetration on each box. @@ -46,7 +45,7 @@ using namespace Vectormath::Aos; // Box boxA, Box boxB: // boxes, represented as 3 half-widths // -// const Transform3& transformA, const Transform3& transformB: +// const vmTransform3& transformA, const vmTransform3& transformB: // box transformations, in world coordinates // // float distanceThreshold: @@ -58,9 +57,9 @@ using namespace Vectormath::Aos; //--------------------------------------------------------------------------- float -boxBoxDistance(Vector3& normal, BoxPoint& boxPointA, BoxPoint& boxPointB, - PE_REF(Box) boxA, const Transform3 & transformA, PE_REF(Box) boxB, - const Transform3 & transformB, +boxBoxDistance(vmVector3& normal, BoxPoint& boxPointA, BoxPoint& boxPointB, + PE_REF(Box) boxA, const vmTransform3 & transformA, PE_REF(Box) boxB, + const vmTransform3 & transformB, float distanceThreshold = FLT_MAX ); #endif /* __BOXBOXDISTANCE_H__ */ diff --git a/src/BulletMultiThreaded/TrbDynBody.h b/src/BulletMultiThreaded/TrbDynBody.h new file mode 100644 index 000000000..d65c22e69 --- /dev/null +++ b/src/BulletMultiThreaded/TrbDynBody.h @@ -0,0 +1,78 @@ +/* + Copyright (C) 2009 Sony Computer Entertainment Inc. + All rights reserved. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. + +*/ + +#ifndef __T_RB_DYN_BODY_H__ +#define __T_RB_DYN_BODY_H__ + +#include +using namespace Vectormath::Aos; + +#include "TrbStateVec.h" + +class CollObject; + +class TrbDynBody +{ +public: + TrbDynBody() + { + fMass = 0.0f; + fCollObject = NULL; + fElasticity = 0.2f; + fFriction = 0.8f; + } + + // Get methods + float getMass() const {return fMass;}; + float getElasticity() const {return fElasticity;} + float getFriction() const {return fFriction;} + CollObject* getCollObject() const {return fCollObject;} + const Matrix3 &getBodyInertia() const {return fIBody;} + const Matrix3 &getBodyInertiaInv() const {return fIBodyInv;} + float getMassInv() const {return fMassInv;} + + // Set methods + void setMass(float mass) {fMass=mass;fMassInv=mass>0.0f?1.0f/mass:0.0f;} + void setBodyInertia(const Matrix3 bodyInertia) {fIBody = bodyInertia;fIBodyInv = inverse(bodyInertia);} + void setElasticity(float elasticity) {fElasticity = elasticity;} + void setFriction(float friction) {fFriction = friction;} + void setCollObject(CollObject *collObj) {fCollObject = collObj;} + + void setBodyInertiaInv(const Matrix3 bodyInertiaInv) + { + fIBody = inverse(bodyInertiaInv); + fIBodyInv = bodyInertiaInv; + } + void setMassInv(float invMass) { + fMass= invMass>0.0f ? 1.0f/invMass :0.0f; + fMassInv=invMass; + } + + +private: + // Rigid Body constants + float fMass; // Rigid Body mass + float fMassInv; // Inverse of mass + Matrix3 fIBody; // Inertia matrix in body's coords + Matrix3 fIBodyInv; // Inertia matrix inverse in body's coords + float fElasticity; // Coefficient of restitution + float fFriction; // Coefficient of friction + +public: + CollObject* fCollObject; // Collision object corresponding the RB +} __attribute__ ((aligned(16))); + +#endif /* __T_RB_DYN_BODY_H__ */ diff --git a/src/BulletMultiThreaded/TrbStateVec.h b/src/BulletMultiThreaded/TrbStateVec.h new file mode 100644 index 000000000..8a76a33d0 --- /dev/null +++ b/src/BulletMultiThreaded/TrbStateVec.h @@ -0,0 +1,334 @@ +/* + Copyright (C) 2009 Sony Computer Entertainment Inc. + All rights reserved. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. + +*/ + +#ifndef __TRBSTATEVEC_H__ +#define __TRBSTATEVEC_H__ + +#include +#include + + +#include "PlatformDefinitions.h" + + +static inline vmVector3 read_Vector3(const float* p) +{ + vmVector3 v; + loadXYZ(v, p); + return v; +} + +static inline vmQuat read_Quat(const float* p) +{ + vmQuat vq; + loadXYZW(vq, p); + return vq; +} + +static inline void store_Vector3(const vmVector3 &src, float* p) +{ + vmVector3 v = src; + storeXYZ(v, p); +} + +static inline void store_Quat(const vmQuat &src, float* p) +{ + vmQuat vq = src; + storeXYZW(vq, p); +} + +// Motion Type +enum { + PfxMotionTypeFixed = 0, + PfxMotionTypeActive, + PfxMotionTypeKeyframe, + PfxMotionTypeOneWay, + PfxMotionTypeTrigger, + PfxMotionTypeCount +}; + +#define PFX_MOTION_MASK_DYNAMIC 0x0a // Active,OneWay +#define PFX_MOTION_MASK_STATIC 0x95 // Fixed,Keyframe,Trigger,Sleeping +#define PFX_MOTION_MASK_SLEEP 0x0e // Can sleep +#define PFX_MOTION_MASK_TYPE 0x7f + +// +// Rigid Body state +// + +#ifdef __CELLOS_LV2__ +ATTRIBUTE_ALIGNED128(class) TrbState +#else +ATTRIBUTE_ALIGNED16(class) TrbState +#endif + +{ +public: + TrbState() + { + setMotionType(PfxMotionTypeActive); + contactFilterSelf=contactFilterTarget=0xffffffff; + deleted = 0; + mSleeping = 0; + useSleep = 1; + trbBodyIdx=0; + mSleepCount=0; + useCcd = 0; + useContactCallback = 0; + useSleepCallback = 0; + linearDamping = 1.0f; + angularDamping = 0.99f; + } + + TrbState(const uint8_t m, const vmVector3& x, const vmQuat& q, const vmVector3& v, const vmVector3& omega ); + + uint16_t mSleepCount; + uint8_t mMotionType; + uint8_t deleted : 1; + uint8_t mSleeping : 1; + uint8_t useSleep : 1; + uint8_t useCcd : 1; + uint8_t useContactCallback : 1; + uint8_t useSleepCallback : 1; + + uint16_t trbBodyIdx; + uint32_t contactFilterSelf; + uint32_t contactFilterTarget; + + float center[3]; // AABB center(World) + float half[3]; // AABB half(World) + + float linearDamping; + float angularDamping; + + float deltaLinearVelocity[3]; + float deltaAngularVelocity[3]; + + float fX[3]; // position + float fQ[4]; // orientation + float fV[3]; // velocity + float fOmega[3]; // angular velocity + + inline void setZero(); // Zeroes out the elements + inline void setIdentity(); // Sets the rotation to identity and zeroes out the other elements + + bool isDeleted() const {return deleted==1;} + + uint16_t getRigidBodyId() const {return trbBodyIdx;} + void setRigidBodyId(uint16_t i) {trbBodyIdx = i;} + + + uint32_t getContactFilterSelf() const {return contactFilterSelf;} + void setContactFilterSelf(uint32_t filter) {contactFilterSelf = filter;} + + uint32_t getContactFilterTarget() const {return contactFilterTarget;} + void setContactFilterTarget(uint32_t filter) {contactFilterTarget = filter;} + + float getLinearDamping() const {return linearDamping;} + float getAngularDamping() const {return angularDamping;} + + void setLinearDamping(float damping) {linearDamping=damping;} + void setAngularDamping(float damping) {angularDamping=damping;} + + + uint8_t getMotionType() const {return mMotionType;} + void setMotionType(uint8_t t) {mMotionType = t;mSleeping=0;mSleepCount=0;} + + uint8_t getMotionMask() const {return (1< c.m_upperLimit) + { + deltaImpulse = c.m_upperLimit-c.m_appliedImpulse; + c.m_appliedImpulse = c.m_upperLimit; + } + else + { + c.m_appliedImpulse = sum; + } + + + if (body1.mMassInv) + { + btVector3 linearComponent = c.m_contactNormal*body1.mMassInv; + ((btVector3&)body1.mDeltaLinearVelocity) += linearComponent*deltaImpulse; + ((btVector3&)body1.mDeltaAngularVelocity) += c.m_angularComponentA*(btVector3(deltaImpulse,deltaImpulse,deltaImpulse));//*m_angularFactor); + } + + if (body2.mMassInv) + { + btVector3 linearComponent = -c.m_contactNormal*body2.mMassInv; + ((btVector3&)body2.mDeltaLinearVelocity) += linearComponent*deltaImpulse; + ((btVector3&)body2.mDeltaAngularVelocity) += c.m_angularComponentB*((btVector3(deltaImpulse,deltaImpulse,deltaImpulse)));//*m_angularFactor); + } + + //body1.internalApplyImpulse(c.m_contactNormal*body1.internalGetInvMass(),c.m_angularComponentA,deltaImpulse); + //body2.internalApplyImpulse(-c.m_contactNormal*body2.internalGetInvMass(),c.m_angularComponentB,deltaImpulse); + +} + + +static SIMD_FORCE_INLINE +void pfxSolveLinearConstraintRow(PfxConstraintRow &constraint, + vmVector3 &deltaLinearVelocityA,vmVector3 &deltaAngularVelocityA, + float massInvA,const vmMatrix3 &inertiaInvA,const vmVector3 &rA, + vmVector3 &deltaLinearVelocityB,vmVector3 &deltaAngularVelocityB, + float massInvB,const vmMatrix3 &inertiaInvB,const vmVector3 &rB) +{ + const vmVector3 normal(btReadVector3(constraint.mNormal)); + float deltaImpulse = constraint.mRhs; + vmVector3 dVA = deltaLinearVelocityA + cross(deltaAngularVelocityA,rA); + vmVector3 dVB = deltaLinearVelocityB + cross(deltaAngularVelocityB,rB); + deltaImpulse -= constraint.mJacDiagInv * dot(normal,dVA-dVB); + float oldImpulse = constraint.mAccumImpulse; + constraint.mAccumImpulse = btClamped(oldImpulse + deltaImpulse,constraint.mLowerLimit,constraint.mUpperLimit); + deltaImpulse = constraint.mAccumImpulse - oldImpulse; + deltaLinearVelocityA += deltaImpulse * massInvA * normal; + deltaAngularVelocityA += deltaImpulse * inertiaInvA * cross(rA,normal); + deltaLinearVelocityB -= deltaImpulse * massInvB * normal; + deltaAngularVelocityB -= deltaImpulse * inertiaInvB * cross(rB,normal); + +} + +void btSolveContactConstraint( + PfxConstraintRow &constraintResponse, + PfxConstraintRow &constraintFriction1, + PfxConstraintRow &constraintFriction2, + const vmVector3 &contactPointA, + const vmVector3 &contactPointB, + PfxSolverBody &solverBodyA, + PfxSolverBody &solverBodyB, + float friction + ) +{ + vmVector3 rA = rotate(solverBodyA.mOrientation,contactPointA); + vmVector3 rB = rotate(solverBodyB.mOrientation,contactPointB); + + pfxSolveLinearConstraintRow(constraintResponse, + solverBodyA.mDeltaLinearVelocity,solverBodyA.mDeltaAngularVelocity,solverBodyA.mMassInv,solverBodyA.mInertiaInv,rA, + solverBodyB.mDeltaLinearVelocity,solverBodyB.mDeltaAngularVelocity,solverBodyB.mMassInv,solverBodyB.mInertiaInv,rB); + + float mf = friction*fabsf(constraintResponse.mAccumImpulse); + constraintFriction1.mLowerLimit = -mf; + constraintFriction1.mUpperLimit = mf; + constraintFriction2.mLowerLimit = -mf; + constraintFriction2.mUpperLimit = mf; + + pfxSolveLinearConstraintRow(constraintFriction1, + solverBodyA.mDeltaLinearVelocity,solverBodyA.mDeltaAngularVelocity,solverBodyA.mMassInv,solverBodyA.mInertiaInv,rA, + solverBodyB.mDeltaLinearVelocity,solverBodyB.mDeltaAngularVelocity,solverBodyB.mMassInv,solverBodyB.mInertiaInv,rB); + + pfxSolveLinearConstraintRow(constraintFriction2, + solverBodyA.mDeltaLinearVelocity,solverBodyA.mDeltaAngularVelocity,solverBodyA.mMassInv,solverBodyA.mInertiaInv,rA, + solverBodyB.mDeltaLinearVelocity,solverBodyB.mDeltaAngularVelocity,solverBodyB.mMassInv,solverBodyB.mInertiaInv,rB); +} + + +void CustomSolveConstraintsTaskParallel( + const PfxParallelGroup *contactParallelGroup,const PfxParallelBatch *contactParallelBatches, + PfxConstraintPair *contactPairs,uint32_t numContactPairs, + btPersistentManifold* offsetContactManifolds__, + const PfxParallelGroup *jointParallelGroup,const PfxParallelBatch *jointParallelBatches, + PfxConstraintPair *jointPairs,uint32_t numJointPairs, + TrbState *offsetRigStates, + PfxSolverBody *offsetSolverBodies, + uint32_t numRigidBodies, + int iteration,unsigned int taskId,unsigned int numTasks,btBarrier *barrier) +{ + + PfxSolverBody staticBody; + staticBody.mMassInv = 0.f; + staticBody.mDeltaAngularVelocity=vmVector3(0,0,0); + staticBody.mDeltaLinearVelocity =vmVector3(0,0,0); + + + for(int k=0;knumPhases;phaseId++) { + for(uint32_t batchId=0;batchIdnumBatches[phaseId];batchId++) { + uint32_t numPairs = jointParallelGroup->numPairs[phaseId*PFX_MAX_SOLVER_BATCHES+batchId]; + if(batchId%numTasks == taskId && numPairs > 0) { + const PfxParallelBatch &batch = jointParallelBatches[phaseId*PFX_MAX_SOLVER_BATCHES+batchId]; + for(int i=0;isync(); + } + + // Contact + for(uint32_t phaseId=0;phaseIdnumPhases;phaseId++) { + for(uint32_t batchId=0;batchIdnumBatches[phaseId];batchId++) { + uint32_t numPairs = contactParallelGroup->numPairs[phaseId*PFX_MAX_SOLVER_BATCHES+batchId]; + if(batchId%numTasks == taskId && numPairs > 0) { + const PfxParallelBatch &batch = contactParallelBatches[phaseId*PFX_MAX_SOLVER_BATCHES+batchId]; + for(int i=0;isync(); + } + } +} + +void CustomPostSolverTask( + TrbState *states, + PfxSolverBody *solverBodies, + uint32_t numRigidBodies) +{ + for(int i=0;i 0.707f) { + // choose p in y-z plane + float a = n[1]*n[1] + n[2]*n[2]; + float k = 1.0f/sqrtf(a); + p[0] = 0; + p[1] = -n[2]*k; + p[2] = n[1]*k; + // set q = n x p + q[0] = a*k; + q[1] = -n[0]*p[2]; + q[2] = n[0]*p[1]; + } + else { + // choose p in x-y plane + float a = n[0]*n[0] + n[1]*n[1]; + float k = 1.0f/sqrtf(a); + p[0] = -n[1]*k; + p[1] = n[0]*k; + p[2] = 0; + // set q = n x p + q[0] = -n[2]*p[1]; + q[1] = n[2]*p[0]; + q[2] = a*k; + } +} + + + +#define PFX_CONTACT_SLOP 0.001f + +void btSetupContactConstraint( + PfxConstraintRow &constraintResponse, + PfxConstraintRow &constraintFriction1, + PfxConstraintRow &constraintFriction2, + float penetrationDepth, + float restitution, + float friction, + const vmVector3 &contactNormal, + const vmVector3 &contactPointA, + const vmVector3 &contactPointB, + const TrbState &stateA, + const TrbState &stateB, + PfxSolverBody &solverBodyA, + PfxSolverBody &solverBodyB, + float separateBias, + float timeStep + ) +{ + vmVector3 rA = rotate(solverBodyA.mOrientation,contactPointA); + vmVector3 rB = rotate(solverBodyB.mOrientation,contactPointB); + + vmMatrix3 K = vmMatrix3::scale(vmVector3(solverBodyA.mMassInv + solverBodyB.mMassInv)) - + crossMatrix(rA) * solverBodyA.mInertiaInv * crossMatrix(rA) - + crossMatrix(rB) * solverBodyB.mInertiaInv * crossMatrix(rB); + + vmVector3 vA = stateA.getLinearVelocity() + cross(stateA.getAngularVelocity(),rA); + vmVector3 vB = stateB.getLinearVelocity() + cross(stateB.getAngularVelocity(),rB); + vmVector3 vAB = vA-vB; + + vmVector3 tangent1,tangent2; + btPlaneSpace1(contactNormal,tangent1,tangent2); + +// constraintResponse.mAccumImpulse = 0.f; +// constraintFriction1.mAccumImpulse = 0.f; +// constraintFriction2.mAccumImpulse = 0.f; + + // Contact Constraint + { + vmVector3 normal = contactNormal; + + float denom = dot(K*normal,normal); + + constraintResponse.mRhs = -(1.0f+restitution)*dot(vAB,normal); // velocity error + constraintResponse.mRhs -= (separateBias * btMin(0.0f,penetrationDepth+PFX_CONTACT_SLOP)) / timeStep; // position error + constraintResponse.mRhs /= denom; + constraintResponse.mJacDiagInv = 1.0f/denom; + constraintResponse.mLowerLimit = 0.0f; + constraintResponse.mUpperLimit = SIMD_INFINITY; + btStoreVector3(normal,constraintResponse.mNormal); + } + + // Friction Constraint 1 + { + vmVector3 normal = tangent1; + + float denom = dot(K*normal,normal); + + constraintFriction1.mJacDiagInv = 1.0f/denom; + constraintFriction1.mRhs = -dot(vAB,normal); + constraintFriction1.mRhs *= constraintFriction1.mJacDiagInv; + constraintFriction1.mLowerLimit = 0.0f; + constraintFriction1.mUpperLimit = SIMD_INFINITY; + btStoreVector3(normal,constraintFriction1.mNormal); + } + + // Friction Constraint 2 + { + vmVector3 normal = tangent2; + + float denom = dot(K*normal,normal); + + constraintFriction2.mJacDiagInv = 1.0f/denom; + constraintFriction2.mRhs = -dot(vAB,normal); + constraintFriction2.mRhs *= constraintFriction2.mJacDiagInv; + constraintFriction2.mLowerLimit = 0.0f; + constraintFriction2.mUpperLimit = SIMD_INFINITY; + btStoreVector3(normal,constraintFriction2.mNormal); + } +} + + +void CustomSetupContactConstraintsTask( + PfxConstraintPair *contactPairs,uint32_t numContactPairs, + TrbState *offsetRigStates, + PfxSolverBody *offsetSolverBodies, + uint32_t numRigidBodies, + float separateBias, + float timeStep) +{ + for(int i=0;i 1) restitution = 0.0f; + + float friction = sqrtf(solverBodyA.friction * solverBodyB.friction); + + for(int j=0;jio); + btCriticalSection* criticalsection = io->setupContactConstraints.criticalSection; + + + //CustomCriticalSection *criticalsection = &io->m_cs; + switch(io->cmd) { + + case PFX_CONSTRAINT_SOLVER_CMD_SOLVE_CONSTRAINTS: + CustomSolveConstraintsTaskParallel( + io->solveConstraints.contactParallelGroup, + io->solveConstraints.contactParallelBatches, + io->solveConstraints.contactPairs, + io->solveConstraints.numContactPairs, + io->solveConstraints.offsetContactManifolds, + + io->solveConstraints.jointParallelGroup, + io->solveConstraints.jointParallelBatches, + io->solveConstraints.jointPairs, + io->solveConstraints.numJointPairs, + + io->solveConstraints.offsetRigStates, + io->solveConstraints.offsetSolverBodies, + io->solveConstraints.numRigidBodies, + io->solveConstraints.iteration, + + io->solveConstraints.taskId, + io->maxTasks1, + io->solveConstraints.barrier + ); + break; + + case PFX_CONSTRAINT_SOLVER_CMD_POST_SOLVER: + CustomPostSolverTask( io->postSolver.states,io->postSolver.solverBodies, io->postSolver.numRigidBodies); + break; + + + case PFX_CONSTRAINT_SOLVER_CMD_SETUP_CONTACT_CONSTRAINTS: + { + bool empty = false; + while(!empty) { + int start,batch; + + criticalsection->lock(); + + start = (int)criticalsection->getSharedParam(0); + batch = (int)criticalsection->getSharedParam(1); + + //PFX_PRINTF("taskId %d start %d num %d\n",arg->taskId,start,batch); + + // ŽŸ‚̃oƒbƒtƒ@‚ðƒZƒbƒg + int nextStart = start + batch; + int rest = btMax((int)io->setupContactConstraints.numContactPairs1 - nextStart,0); + int nextBatch = (rest > batch)?batch:rest; + + criticalsection->setSharedParam(0,nextStart); + criticalsection->setSharedParam(1,nextBatch); + + criticalsection->unlock(); + + if(batch > 0) { + CustomSetupContactConstraintsTask( + io->setupContactConstraints.offsetContactPairs+start,batch, +// io->setupContactConstraints.offsetContactManifolds, + io->setupContactConstraints.offsetRigStates, +// io->setupContactConstraints.offsetRigBodies, + io->setupContactConstraints.offsetSolverBodies, + io->setupContactConstraints.numRigidBodies, + io->setupContactConstraints.separateBias, + io->setupContactConstraints.timeStep); + } + else { + empty = true; + } + } + } + break; + + default: + { + btAssert(0); + } + } + +} + + +void CustomSetupContactConstraintsNew( + PfxConstraintPair *contactPairs1,uint32_t numContactPairs, + btPersistentManifold *offsetContactManifolds, + TrbState *offsetRigStates, + PfxSolverBody *offsetSolverBodies, + uint32_t numRigidBodies, + float separationBias, + float timeStep, + class btThreadSupportInterface* threadSupport, + btCriticalSection* criticalSection, + btConstraintSolverIO *io + ) +{ + int maxTasks = threadSupport->getNumTasks(); + + int div = (int)maxTasks * 4; + int batch = ((int)numContactPairs + div - 1) / div; +#ifdef __PPU__ + BulletPE2ConstraintSolverSpursSupport* spursThread = (BulletPE2ConstraintSolverSpursSupport*) threadSupport; +#endif + if (criticalSection) + { + criticalSection->setSharedParam(0,0); + criticalSection->setSharedParam(1,btMin(batch,64)); // batched number + } else + { +#ifdef __PPU__ + spursThread->setSharedParam(0,0); + spursThread->setSharedParam(1,btMin(batch,64)); // batched number +#endif //__PPU__ + } + + for(int t=0;tgetBarrierAddress(); + io[t].criticalsectionAddr2 = (unsigned int)spursThread->getCriticalSectionAddress(); +#endif + + +//#define SEQUENTIAL_SETUP +#ifdef SEQUENTIAL_SETUP + CustomSetupContactConstraintsTask(contactPairs1,numContactPairs,offsetRigStates,offsetSolverBodies,numRigidBodies,separationBias,timeStep); +#else + threadSupport->sendRequest(1,(ppu_address_t)&io[t],t); +#endif + + } +#ifndef SEQUENTIAL_SETUP + unsigned int arg0,arg1; + for(int t=0;twaitForResponse(&arg0,&arg1); + } +#endif //SEQUENTIAL_SETUP + +} + + +void CustomSplitConstraints( + PfxConstraintPair *pairs,uint32_t numPairs, + PfxParallelGroup &group,PfxParallelBatch *batches, + uint32_t numTasks, + uint32_t numRigidBodies, + void *poolBuff, + uint32_t poolBytes + ) +{ + HeapManager pool((unsigned char*)poolBuff,poolBytes); + + // ƒXƒe[ƒgƒ`ƒFƒbƒN—pƒrƒbƒgƒtƒ‰ƒOƒe[ƒuƒ‹ + int bufSize = sizeof(uint8_t)*numRigidBodies; + bufSize = ((bufSize+127)>>7)<<7; // 128 bytes alignment + uint8_t *bodyTable = (uint8_t*)pool.allocate(bufSize,HeapManager::ALIGN128); + + // ƒyƒAƒ`ƒFƒbƒN—pƒrƒbƒgƒtƒ‰ƒOƒe[ƒuƒ‹ + uint32_t *pairTable; + size_t allocSize = sizeof(uint32_t)*((numPairs+31)/32); + pairTable = (uint32_t*)pool.allocate(allocSize); + memset(pairTable,0,allocSize); + + // –Ú•W‚Æ‚·‚é•ªŠ„” + uint32_t targetCount = btMax(uint32_t(PFX_MIN_SOLVER_PAIRS),btMin(numPairs / (numTasks*2),uint32_t(PFX_MAX_SOLVER_PAIRS))); + uint32_t startIndex = 0; + + uint32_t phaseId; + uint32_t batchId; + uint32_t totalCount=0; + + uint32_t maxBatches = btMin(numTasks,uint32_t(PFX_MAX_SOLVER_BATCHES)); + + for(phaseId=0;phaseId>5; + uint32_t maskP = 1L << (i & 31); + + //pair is already assigned to a phase/batch + if(pairTable[idxP] & maskP) { + continue; + } + + uint32_t idxA = pfxGetRigidBodyIdA(pairs[i]); + uint32_t idxB = pfxGetRigidBodyIdB(pairs[i]); + + // —¼•û‚Æ‚àƒAƒNƒeƒBƒu‚łȂ¢A‚Ü‚½‚ÍÕ“Ë“_‚ª‚O‚̃yƒA‚Í“o˜^‘ÎÛ‚©‚ç‚Í‚¸‚· + if(!pfxGetActive(pairs[i]) || pfxGetNumConstraints(pairs[i]) == 0 || + ((pfxGetMotionMaskA(pairs[i])&PFX_MOTION_MASK_STATIC) && (pfxGetMotionMaskB(pairs[i])&PFX_MOTION_MASK_STATIC)) ) { + if(startIndexCheck) + startIndex++; + //assign pair -> skip it because it has no constraints + pairTable[idxP] |= maskP; + totalCount++; + continue; + } + + // ˆË‘¶«‚̃`ƒFƒbƒN + if( (bodyTable[idxA] != batchId && bodyTable[idxA] != 0xff) || + (bodyTable[idxB] != batchId && bodyTable[idxB] != 0xff) ) { + startIndexCheck = false; + //bodies of the pair are already assigned to another batch within this phase + continue; + } + + // ˆË‘¶«”»’èƒe[ƒuƒ‹‚É“o˜^ + if(pfxGetMotionMaskA(pairs[i])&PFX_MOTION_MASK_DYNAMIC) + bodyTable[idxA] = batchId; + if(pfxGetMotionMaskB(pairs[i])&PFX_MOTION_MASK_DYNAMIC) + bodyTable[idxB] = batchId; + + if(startIndexCheck) + startIndex++; + + pairTable[idxP] |= maskP; + //add the pair 'i' to the current batch + batch.pairIndices[pairId++] = i; + pairCount++; + } + + group.numPairs[phaseId*PFX_MAX_SOLVER_BATCHES+batchId] = (uint16_t)pairId; + totalCount += pairCount; + } + + group.numBatches[phaseId] = batchId; + } + + group.numPhases = phaseId; + + pool.clear(); +} + + + +void CustomSolveConstraintsParallel( + PfxConstraintPair *contactPairs,uint32_t numContactPairs, + + PfxConstraintPair *jointPairs,uint32_t numJointPairs, + + TrbState *offsetRigStates, + PfxSolverBody *offsetSolverBodies, + uint32_t numRigidBodies, + struct btConstraintSolverIO* io, + class btThreadSupportInterface* threadSupport, + int iteration, + void* poolBuf, + int poolBytes, + class btBarrier* barrier) + { + + int maxTasks = threadSupport->getNumTasks(); +// config.taskManager->setTaskEntry(PFX_SOLVER_ENTRY); + + HeapManager pool((unsigned char*)poolBuf,poolBytes); + + { + PfxParallelGroup *cgroup = (PfxParallelGroup*)pool.allocate(sizeof(PfxParallelGroup)); + PfxParallelBatch *cbatches = (PfxParallelBatch*)pool.allocate(sizeof(PfxParallelBatch)*(PFX_MAX_SOLVER_PHASES*PFX_MAX_SOLVER_BATCHES),128); + PfxParallelGroup *jgroup = (PfxParallelGroup*)pool.allocate(sizeof(PfxParallelGroup)); + PfxParallelBatch *jbatches = (PfxParallelBatch*)pool.allocate(sizeof(PfxParallelBatch)*(PFX_MAX_SOLVER_PHASES*PFX_MAX_SOLVER_BATCHES),128); + + uint32_t tmpBytes = poolBytes - 2 * (sizeof(PfxParallelGroup) + sizeof(PfxParallelBatch)*(PFX_MAX_SOLVER_PHASES*PFX_MAX_SOLVER_BATCHES) + 128); + void *tmpBuff = pool.allocate(tmpBytes); + + CustomSplitConstraints(contactPairs,numContactPairs,*cgroup,cbatches,maxTasks,numRigidBodies,tmpBuff,tmpBytes); + CustomSplitConstraints(jointPairs,numJointPairs,*jgroup,jbatches,maxTasks,numRigidBodies,tmpBuff,tmpBytes); + +//#define SOLVE_SEQUENTIAL +#ifdef SOLVE_SEQUENTIAL + CustomSolveConstraintsTask( + io->solveConstraints.contactParallelGroup, + io->solveConstraints.contactParallelBatches, + io->solveConstraints.contactPairs, + io->solveConstraints.numContactPairs, + io->solveConstraints.offsetContactManifolds, + + io->solveConstraints.jointParallelGroup, + io->solveConstraints.jointParallelBatches, + io->solveConstraints.jointPairs, + io->solveConstraints.numJointPairs, + io->solveConstraints.offsetJoints, + + io->solveConstraints.offsetRigStates, + io->solveConstraints.offsetSolverBodies, + io->solveConstraints.numRigidBodies, + io->solveConstraints.iteration,0,1,0);//arg->taskId,1,0);//,arg->maxTasks,arg->barrier); +#else + for(int t=0;tgetBarrierAddress(); + io[t].criticalsectionAddr2 = (unsigned int)spursThread->getCriticalSectionAddress(); +#endif + + threadSupport->sendRequest(1,(ppu_address_t)&io[t],t); + } + + unsigned int arg0,arg1; + for(int t=0;twaitForResponse(&arg0,&arg1); + } +#endif + pool.clear(); + } + + { + int batch = ((int)numRigidBodies + maxTasks - 1) / maxTasks; + int rest = (int)numRigidBodies; + int start = 0; + + for(int t=0;t 0 ? batch : rest; + io[t].cmd = PFX_CONSTRAINT_SOLVER_CMD_POST_SOLVER; + io[t].postSolver.states = offsetRigStates + start; + io[t].postSolver.solverBodies = offsetSolverBodies + start; + io[t].postSolver.numRigidBodies = (uint32_t)num; + io[t].maxTasks1 = maxTasks; +#ifdef __PPU__ + BulletPE2ConstraintSolverSpursSupport* spursThread = (BulletPE2ConstraintSolverSpursSupport*) threadSupport; + io[t].barrierAddr2 = (unsigned int)spursThread->getBarrierAddress(); + io[t].criticalsectionAddr2 = (unsigned int)spursThread->getCriticalSectionAddress(); +#endif + +#ifdef SOLVE_SEQUENTIAL + CustomPostSolverTask( io[t].postSolver.states,io[t].postSolver.solverBodies, io[t].postSolver.numRigidBodies); +#else + threadSupport->sendRequest(1,(ppu_address_t)&io[t],t); +#endif + rest -= num; + start += num; + } + + unsigned int arg0,arg1; + for(int t=0;twaitForResponse(&arg0,&arg1); +#endif + } + } + +} + + + +void BPE_customConstraintSolverSequentialNew(unsigned int new_num, PfxBroadphasePair *new_pairs1 , + TrbState* states,int numRigidBodies, + struct PfxSolverBody* solverBodies, + btPersistentManifold* contacts, + PfxConstraintPair* jointPairs, unsigned int numJoints, + float separateBias, + float timeStep, + int iteration, + btThreadSupportInterface* solverThreadSupport, + btCriticalSection* criticalSection, + struct btConstraintSolverIO* solverIO, + btBarrier* barrier + ) +{ + + { + BT_PROFILE("pfxSetupConstraints"); + + for(int i=0;i m_mystates; + btAlignedObjectArray m_mysolverbodies; + btAlignedObjectArray m_mypairs; + btAlignedObjectArray m_jointPairs; + +}; + +btConstraintSolverIO* createSolverIO(int numThreads) +{ + return new btConstraintSolverIO[numThreads]; +} + +btParallelConstraintSolver::btParallelConstraintSolver(btThreadSupportInterface* solverThreadSupport) +{ + + m_solverThreadSupport = solverThreadSupport;//createSolverThreadSupport(maxNumThreads); + m_solverIO = createSolverIO(m_solverThreadSupport->getNumTasks()); + + m_barrier = m_solverThreadSupport->createBarrier(); + m_criticalSection = m_solverThreadSupport->createCriticalSection(); + + m_memoryCache = new btParallelSolverMemoryCache(); } btParallelConstraintSolver::~btParallelConstraintSolver() { - //exit MiniCL - + delete m_memoryCache; } - -btScalar btParallelConstraintSolver::solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc) + + +btScalar btParallelConstraintSolver::solveGroup(btCollisionObject** bodies1,int numRigidBodies,btPersistentManifold** manifoldPtr,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher) { - { - int i; - btPersistentManifold* manifold = 0; -// btCollisionObject* colObj0=0,*colObj1=0; - - - for (i=0;im_mysolverbodies.resize(numRigidBodies); + m_memoryCache->m_mystates.resize(numRigidBodies); - btContactSolverInfo info = infoGlobal; - - - - int numConstraintPool = m_tmpSolverContactConstraintPool.size(); - int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size(); - - ///@todo: use stack allocator for such temporarily memory, same for solver bodies/constraints - m_orderTmpConstraintPool.resize(numConstraintPool); - m_orderFrictionConstraintPool.resize(numFrictionPool); { - int i; - for (i=0;isetCompanionId(i); + + PfxSolverBody& solverBody = m_memoryCache->m_mysolverbodies[i]; + btRigidBody* rb = btRigidBody::upcast(obj); + TrbState& state = m_memoryCache->m_mystates[i]; + + state.reset(); + const btQuaternion& orgOri = obj->getWorldTransform().getRotation(); + vmQuat orn(orgOri.getX(),orgOri.getY(),orgOri.getZ(),orgOri.getW()); + state.setPosition((vmVector3&) obj->getWorldTransform().getOrigin()); + state.setOrientation(orn); + state.setPosition(state.getPosition()); + state.setRigidBodyId(i); + state.setAngularDamping(0); + state.setLinearDamping(0); + + + solverBody.mOrientation = state.getOrientation(); + solverBody.mDeltaLinearVelocity = vmVector3(0.0f); + solverBody.mDeltaAngularVelocity = vmVector3(0.0f); + solverBody.friction = obj->getFriction(); + solverBody.restitution = obj->getRestitution(); + + state.resetSleepCount(); + + //if(state.getMotionMask()&PFX_MOTION_MASK_DYNAMIC) { + if (rb && (rb->getInvMass()>0.f)) { - m_orderTmpConstraintPool[i] = i; + state.setAngularVelocity(vmVector3(rb->getAngularVelocity().getX(),rb->getAngularVelocity().getY(),rb->getAngularVelocity().getZ())); + state.setLinearVelocity(vmVector3(rb->getLinearVelocity().getX(),rb->getLinearVelocity().getY(),rb->getLinearVelocity().getZ())); + + state.setMotionType(PfxMotionTypeActive); + vmMatrix3 ori(solverBody.mOrientation); + vmMatrix3 localInvInertia = vmMatrix3::identity(); + localInvInertia.setCol(0,vmVector3(rb->getInvInertiaDiagLocal().getX(),0,0)); + localInvInertia.setCol(1,vmVector3(0, rb->getInvInertiaDiagLocal().getY(),0)); + localInvInertia.setCol(2,vmVector3(0,0, rb->getInvInertiaDiagLocal().getZ())); + + solverBody.mMassInv = rb->getInvMass(); + solverBody.mInertiaInv = ori * localInvInertia * transpose(ori); + } else + { + state.setAngularVelocity(vmVector3(0)); + state.setLinearVelocity(vmVector3(0)); + + state.setMotionType(PfxMotionTypeFixed); + m_memoryCache->m_mysolverbodies[i].mMassInv = 0.f; + m_memoryCache->m_mysolverbodies[i].mInertiaInv = vmMatrix3(0.0f); } - for (i=0;im_mypairs.resize(numManifolds); + m_memoryCache->m_jointPairs.resize(numConstraints); +#endif//USE_C_ARRAYS + + int actualNumManifolds= 0; + { + BT_PROFILE("convert manifolds"); + for (int i1=0;i1getNumContacts()>0) + { + btPersistentManifold* m = manifoldPtr[i1]; + btCollisionObject* obA = (btCollisionObject*)m->getBody0(); + btCollisionObject* obB = (btCollisionObject*)m->getBody1(); + bool obAisActive = !obA->isStaticOrKinematicObject() && obA->isActive(); + bool obBisActive = !obB->isStaticOrKinematicObject() && obB->isActive(); + + if (!obAisActive && !obBisActive) + continue; + + + //int contactId = i1;//actualNumManifolds; + + PfxBroadphasePair& pair = m_memoryCache->m_mypairs[actualNumManifolds]; + //init those + float compFric = obA->getFriction()*obB->getFriction();//@todo + int idA = obA->getCompanionId(); + int idB = obB->getCompanionId(); + + m->m_companionIdA = idA; + m->m_companionIdB = idB; + + + // if ((mysolverbodies[idA].mMassInv!=0)&&(mysolverbodies[idB].mMassInv!=0)) + // continue; + int numPosPoints=0; + for (int p=0;pgetNumContacts();p++) + { + btManifoldPoint& pt = m->getContactPoint(p); + float dist = pt.getDistance(); + //if (dist<0.001) + numPosPoints++; + } + + + numPosPoints = numPosPoints; + totalPoints+=numPosPoints; + pfxSetRigidBodyIdA(pair,idA); + pfxSetRigidBodyIdB(pair,idB); + pfxSetMotionMaskA(pair,m_memoryCache->m_mystates[idA].getMotionMask()); + pfxSetMotionMaskB(pair,m_memoryCache->m_mystates[idB].getMotionMask()); + pfxSetActive(pair,numPosPoints>0); + + pfxSetBroadphaseFlag(pair,0); + pfxSetContactId(pair,(uint64_t)m);//contactId); + pfxSetNumConstraints(pair,numPosPoints);//manifoldPtr[i]->getNumContacts()); + actualNumManifolds++; + } + } } + PfxConstraintPair* jointPairs=0; + jointPairs = numConstraints? &m_memoryCache->m_jointPairs[0]:0; + int actualNumJoints=0; + + + //if (1) + { + BT_PROFILE("convert constraints"); + { + + int totalNumRows = 0; + int i; + + m_tmpConstraintSizesPool.resize(numConstraints); + //calculate the total number of contraint rows + for (i=0;igetInfo1(&info1); + totalNumRows += info1.m_numConstraintRows; + } + m_tmpSolverNonContactConstraintPool.resize(totalNumRows); + + + ///setup the btSolverConstraints + int currentRow = 0; + + for (i=0;igetRigidBodyA(); + btRigidBody& rbB = constraint->getRigidBodyB(); + + + int j; + for ( j=0;jm_contactNormal; + info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal; + info2.m_J2linearAxis = 0; + info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal; + info2.rowskip = sizeof(btSolverConstraint)/sizeof(btScalar);//check this + ///the size of btSolverConstraint needs be a multiple of btScalar + btAssert(info2.rowskip*sizeof(btScalar)== sizeof(btSolverConstraint)); + info2.m_constraintError = ¤tConstraintRow->m_rhs; + currentConstraintRow->m_cfm = infoGlobal.m_globalCfm; + info2.cfm = ¤tConstraintRow->m_cfm; + info2.m_lowerLimit = ¤tConstraintRow->m_lowerLimit; + info2.m_upperLimit = ¤tConstraintRow->m_upperLimit; + info2.m_numIterations = infoGlobal.m_numIterations; + constraints[i]->getInfo2(&info2); + + int idA = constraint->getRigidBodyA().getCompanionId(); + int idB = constraint->getRigidBodyB().getCompanionId(); + + + ///finalize the constraint setup + for ( j=0;jgetRigidBodyA().getInvInertiaTensorWorld()*ftorqueAxis1*constraint->getRigidBodyA().getAngularFactor(); + } + { + const btVector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal; + solverConstraint.m_angularComponentB = constraint->getRigidBodyB().getInvInertiaTensorWorld()*ftorqueAxis2*constraint->getRigidBodyB().getAngularFactor(); + } + + { + btVector3 iMJlA = solverConstraint.m_contactNormal*rbA.getInvMass(); + btVector3 iMJaA = rbA.getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal; + btVector3 iMJlB = solverConstraint.m_contactNormal*rbB.getInvMass();//sign of normal? + btVector3 iMJaB = rbB.getInvInertiaTensorWorld()*solverConstraint.m_relpos2CrossNormal; + + btScalar sum = iMJlA.dot(solverConstraint.m_contactNormal); + sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal); + sum += iMJlB.dot(solverConstraint.m_contactNormal); + sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal); + + solverConstraint.m_jacDiagABInv = btScalar(1.)/sum; + } + + + ///fix rhs + ///todo: add force/torque accelerators + { + btScalar rel_vel; + btScalar vel1Dotn = solverConstraint.m_contactNormal.dot(rbA.getLinearVelocity()) + solverConstraint.m_relpos1CrossNormal.dot(rbA.getAngularVelocity()); + btScalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rbB.getLinearVelocity()) + solverConstraint.m_relpos2CrossNormal.dot(rbB.getAngularVelocity()); + + rel_vel = vel1Dotn+vel2Dotn; + + btScalar restitution = 0.f; + btScalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2 + btScalar velocityError = restitution - rel_vel;// * damping; + btScalar penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv; + btScalar velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv; + solverConstraint.m_rhs = penetrationImpulse+velocityImpulse; + solverConstraint.m_appliedImpulse = 0.f; + + } + } + + PfxConstraintPair& pair = jointPairs[actualNumJoints]; + + int numConstraintRows= info1.m_numConstraintRows; + pfxSetNumConstraints(pair,numConstraintRows); + + + + pfxSetRigidBodyIdA(pair,idA); + pfxSetRigidBodyIdB(pair,idB); + //is this needed? + pfxSetMotionMaskA(pair,m_memoryCache->m_mystates[idA].getMotionMask()); + pfxSetMotionMaskB(pair,m_memoryCache->m_mystates[idB].getMotionMask()); + + pfxSetActive(pair,true); + pfxSetContactId(pair,(uint64_t)currentConstraintRow);//contactId); + actualNumJoints++; + + + } + currentRow+=m_tmpConstraintSizesPool[i].m_numConstraintRows; + } + } + } + + + + float separateBias=0.1;//info.m_erp;//or m_erp2? + float timeStep=infoGlobal.m_timeStep; + int iteration=infoGlobal.m_numIterations; + + //create a pair for each constraints, copy over info etc + + + + + + { + BT_PROFILE("compute num contacts"); + int totalContacts =0; + + for (int i=0;im_mypairs[i]; + totalContacts += pfxGetNumConstraints(*pair); + } + //printf("numManifolds = %d\n",numManifolds); + //printf("totalContacts=%d\n",totalContacts); + } + + + +// printf("actualNumManifolds=%d\n",actualNumManifolds); + { + BT_PROFILE("BPE_customConstraintSolverSequentialNew"); + if (numRigidBodies>0 && (actualNumManifolds+actualNumJoints)>0) + { +// PFX_PRINTF("num points = %d\n",totalPoints); +// PFX_PRINTF("num points PFX = %d\n",total); + + BPE_customConstraintSolverSequentialNew( + actualNumManifolds, + &m_memoryCache->m_mypairs[0], + &m_memoryCache->m_mystates[0],numRigidBodies, + &m_memoryCache->m_mysolverbodies[0], + 0,//manifoldArray, + jointPairs,actualNumJoints,separateBias,timeStep,iteration, + m_solverThreadSupport,m_criticalSection,m_solverIO,m_barrier); + } + } + + //copy results back to bodies + { + BT_PROFILE("copy back"); + for (int i=0;im_mystates[i]; + if (rb && (rb->getInvMass()>0.f)) + { + rb->setLinearVelocity(btVector3(state.getLinearVelocity().getX(),state.getLinearVelocity().getY(),state.getLinearVelocity().getZ())); + rb->setAngularVelocity(btVector3(state.getAngularVelocity().getX(),state.getAngularVelocity().getY(),state.getAngularVelocity().getZ())); + } + } + } + + return 0.f; } - diff --git a/src/BulletMultiThreaded/btParallelConstraintSolver.h b/src/BulletMultiThreaded/btParallelConstraintSolver.h index c347f96f5..2e8fbaf13 100644 --- a/src/BulletMultiThreaded/btParallelConstraintSolver.h +++ b/src/BulletMultiThreaded/btParallelConstraintSolver.h @@ -18,22 +18,238 @@ subject to the following restrictions: #define __BT_PARALLEL_CONSTRAINT_SOLVER_H #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h" - + + + + +#include "LinearMath/btScalar.h" +#include "PlatformDefinitions.h" + + +#define PFX_MAX_SOLVER_PHASES 64 +#define PFX_MAX_SOLVER_BATCHES 16 +#define PFX_MAX_SOLVER_PAIRS 128 +#define PFX_MIN_SOLVER_PAIRS 16 + +#ifdef __CELLOS_LV2__ +ATTRIBUTE_ALIGNED128(struct) PfxParallelBatch { +#else +ATTRIBUTE_ALIGNED16(struct) PfxParallelBatch { +#endif + uint16_t pairIndices[PFX_MAX_SOLVER_PAIRS]; +}; + +#ifdef __CELLOS_LV2__ +ATTRIBUTE_ALIGNED128(struct) PfxParallelGroup { +#else +ATTRIBUTE_ALIGNED16(struct) PfxParallelGroup { +#endif + uint16_t numPhases; + uint16_t numBatches[PFX_MAX_SOLVER_PHASES]; + uint16_t numPairs[PFX_MAX_SOLVER_PHASES*PFX_MAX_SOLVER_BATCHES]; +}; + + + +ATTRIBUTE_ALIGNED16(struct) PfxSortData16 { + union { + uint8_t i8data[16]; + uint16_t i16data[8]; + uint32_t i32data[4]; +#ifdef __SPU__ + vec_uint4 vdata; +#endif + }; + +#ifdef __SPU__ + void set8(int elem,uint8_t data) {vdata=(vec_uint4)spu_insert(data,(vec_uchar16)vdata,elem);} + void set16(int elem,uint16_t data) {vdata=(vec_uint4)spu_insert(data,(vec_ushort8)vdata,elem);} + void set32(int elem,uint32_t data) {vdata=(vec_uint4)spu_insert(data,(vec_uint4)vdata,elem);} + uint8_t get8(int elem) const {return spu_extract((vec_uchar16)vdata,elem);} + uint16_t get16(int elem) const {return spu_extract((vec_ushort8)vdata,elem);} + uint32_t get32(int elem) const {return spu_extract((vec_uint4)vdata,elem);} +#else + void set8(int elem,uint8_t data) {i8data[elem] = data;} + void set16(int elem,uint16_t data) {i16data[elem] = data;} + void set32(int elem,uint32_t data) {i32data[elem] = data;} + uint8_t get8(int elem) const {return i8data[elem];} + uint16_t get16(int elem) const {return i16data[elem];} + uint32_t get32(int elem) const {return i32data[elem];} +#endif +}; + +typedef PfxSortData16 PfxConstraintPair; + + +//J PfxBroadphasePair‚Æ‹¤’Ê + +SIMD_FORCE_INLINE void pfxSetConstraintId(PfxConstraintPair &pair,uint32_t i) {pair.set32(2,i);} +SIMD_FORCE_INLINE void pfxSetNumConstraints(PfxConstraintPair &pair,uint8_t n) {pair.set8(7,n);} + +SIMD_FORCE_INLINE uint32_t pfxGetConstraintId1(const PfxConstraintPair &pair) {return pair.get32(2);} +SIMD_FORCE_INLINE uint8_t pfxGetNumConstraints(const PfxConstraintPair &pair) {return pair.get8(7);} + +typedef PfxSortData16 PfxBroadphasePair; + +SIMD_FORCE_INLINE void pfxSetRigidBodyIdA(PfxBroadphasePair &pair,uint16_t i) {pair.set16(0,i);} +SIMD_FORCE_INLINE void pfxSetRigidBodyIdB(PfxBroadphasePair &pair,uint16_t i) {pair.set16(1,i);} +SIMD_FORCE_INLINE void pfxSetMotionMaskA(PfxBroadphasePair &pair,uint8_t i) {pair.set8(4,i);} +SIMD_FORCE_INLINE void pfxSetMotionMaskB(PfxBroadphasePair &pair,uint8_t i) {pair.set8(5,i);} +SIMD_FORCE_INLINE void pfxSetBroadphaseFlag(PfxBroadphasePair &pair,uint8_t f) {pair.set8(6,(pair.get8(6)&0xf0)|(f&0x0f));} +SIMD_FORCE_INLINE void pfxSetActive(PfxBroadphasePair &pair,bool b) {pair.set8(6,(pair.get8(6)&0x0f)|((b?1:0)<<4));} +SIMD_FORCE_INLINE void pfxSetContactId(PfxBroadphasePair &pair,uint32_t i) {pair.set32(2,i);} + +SIMD_FORCE_INLINE uint16_t pfxGetRigidBodyIdA(const PfxBroadphasePair &pair) {return pair.get16(0);} +SIMD_FORCE_INLINE uint16_t pfxGetRigidBodyIdB(const PfxBroadphasePair &pair) {return pair.get16(1);} +SIMD_FORCE_INLINE uint8_t pfxGetMotionMaskA(const PfxBroadphasePair &pair) {return pair.get8(4);} +SIMD_FORCE_INLINE uint8_t pfxGetMotionMaskB(const PfxBroadphasePair &pair) {return pair.get8(5);} +SIMD_FORCE_INLINE uint8_t pfxGetBroadphaseFlag(const PfxBroadphasePair &pair) {return pair.get8(6)&0x0f;} +SIMD_FORCE_INLINE bool pfxGetActive(const PfxBroadphasePair &pair) {return (pair.get8(6)>>4)!=0;} +SIMD_FORCE_INLINE uint32_t pfxGetContactId1(const PfxBroadphasePair &pair) {return pair.get32(2);} + + + +#if defined(__PPU__) || defined (__SPU__) +ATTRIBUTE_ALIGNED128(struct) PfxSolverBody { +#else +ATTRIBUTE_ALIGNED16(struct) PfxSolverBody { +#endif + vmVector3 mDeltaLinearVelocity; + vmVector3 mDeltaAngularVelocity; + vmMatrix3 mInertiaInv; + vmQuat mOrientation; + float mMassInv; + float friction; + float restitution; + float unused; + float unused2; + float unused3; + float unused4; + float unused5; +}; + + +#ifdef __PPU__ +#include "SpuDispatch/BulletPE2ConstraintSolverSpursSupport.h" +#endif + + + +static SIMD_FORCE_INLINE vmVector3 btReadVector3(const float* p) +{ + vmVector3 v; + loadXYZ(v, p); + return v; +} + +static SIMD_FORCE_INLINE vmQuat btReadQuat(const float* p) +{ + vmQuat vq; + loadXYZW(vq, p); + return vq; +} + +static SIMD_FORCE_INLINE void btStoreVector3(const vmVector3 &src, float* p) +{ + vmVector3 v = src; + storeXYZ(v, p); +} + + + + +class btPersistentManifold; + +enum { + PFX_CONSTRAINT_SOLVER_CMD_SETUP_SOLVER_BODIES, + PFX_CONSTRAINT_SOLVER_CMD_SETUP_CONTACT_CONSTRAINTS, + PFX_CONSTRAINT_SOLVER_CMD_SETUP_JOINT_CONSTRAINTS, + PFX_CONSTRAINT_SOLVER_CMD_SOLVE_CONSTRAINTS, + PFX_CONSTRAINT_SOLVER_CMD_POST_SOLVER +}; + + +struct PfxSetupContactConstraintsIO { + PfxConstraintPair *offsetContactPairs; + uint32_t numContactPairs1; + class TrbState *offsetRigStates; + struct PfxSolverBody *offsetSolverBodies; + uint32_t numRigidBodies; + float separateBias; + float timeStep; + class btCriticalSection* criticalSection; +}; + + + +struct PfxSolveConstraintsIO { + PfxParallelGroup *contactParallelGroup; + PfxParallelBatch *contactParallelBatches; + PfxConstraintPair *contactPairs; + uint32_t numContactPairs; + btPersistentManifold *offsetContactManifolds; + PfxParallelGroup *jointParallelGroup; + PfxParallelBatch *jointParallelBatches; + PfxConstraintPair *jointPairs; + uint32_t numJointPairs; + TrbState *offsetRigStates; + PfxSolverBody *offsetSolverBodies; + uint32_t numRigidBodies; + uint32_t iteration; + + uint32_t taskId; + + class btBarrier* barrier; + +}; + +struct PfxPostSolverIO { + TrbState *states; + PfxSolverBody *solverBodies; + uint32_t numRigidBodies; +}; + +ATTRIBUTE_ALIGNED16(struct) btConstraintSolverIO { + uint8_t cmd; + union { + PfxSetupContactConstraintsIO setupContactConstraints; + PfxSolveConstraintsIO solveConstraints; + PfxPostSolverIO postSolver; + }; + + //SPU only + uint32_t barrierAddr2; + uint32_t criticalsectionAddr2; + uint32_t maxTasks1; +}; + + + + +void SolverThreadFunc(void* userPtr,void* lsMemory); +void* SolverlsMemoryFunc(); +///The btParallelConstraintSolver performs computations on constraint rows in parallel +///Using the cross-platform threading it supports Windows, Linux, Mac OSX and PlayStation 3 Cell SPUs class btParallelConstraintSolver : public btSequentialImpulseConstraintSolver { + protected: + struct btParallelSolverMemoryCache* m_memoryCache; + + class btThreadSupportInterface* m_solverThreadSupport; + + struct btConstraintSolverIO* m_solverIO; + class btBarrier* m_barrier; + class btCriticalSection* m_criticalSection; + public: - btParallelConstraintSolver(); + btParallelConstraintSolver(class btThreadSupportInterface* solverThreadSupport); virtual ~btParallelConstraintSolver(); - //virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher); - - btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc); - - + virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher); }; diff --git a/src/BulletMultiThreaded/vectormath/scalar/cpp/boolInVec.h b/src/BulletMultiThreaded/vectormath/scalar/boolInVec.h similarity index 100% rename from src/BulletMultiThreaded/vectormath/scalar/cpp/boolInVec.h rename to src/BulletMultiThreaded/vectormath/scalar/boolInVec.h diff --git a/src/BulletMultiThreaded/vectormath/scalar/cpp/floatInVec.h b/src/BulletMultiThreaded/vectormath/scalar/floatInVec.h similarity index 100% rename from src/BulletMultiThreaded/vectormath/scalar/cpp/floatInVec.h rename to src/BulletMultiThreaded/vectormath/scalar/floatInVec.h diff --git a/src/BulletMultiThreaded/vectormath/scalar/cpp/mat_aos.h b/src/BulletMultiThreaded/vectormath/scalar/mat_aos.h similarity index 100% rename from src/BulletMultiThreaded/vectormath/scalar/cpp/mat_aos.h rename to src/BulletMultiThreaded/vectormath/scalar/mat_aos.h diff --git a/src/BulletMultiThreaded/vectormath/scalar/cpp/quat_aos.h b/src/BulletMultiThreaded/vectormath/scalar/quat_aos.h similarity index 100% rename from src/BulletMultiThreaded/vectormath/scalar/cpp/quat_aos.h rename to src/BulletMultiThreaded/vectormath/scalar/quat_aos.h diff --git a/src/BulletMultiThreaded/vectormath/scalar/cpp/vec_aos.h b/src/BulletMultiThreaded/vectormath/scalar/vec_aos.h similarity index 100% rename from src/BulletMultiThreaded/vectormath/scalar/cpp/vec_aos.h rename to src/BulletMultiThreaded/vectormath/scalar/vec_aos.h diff --git a/src/BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h b/src/BulletMultiThreaded/vectormath/scalar/vectormath_aos.h similarity index 100% rename from src/BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h rename to src/BulletMultiThreaded/vectormath/scalar/vectormath_aos.h diff --git a/src/BulletMultiThreaded/vectormath/sse/boolInVec.h b/src/BulletMultiThreaded/vectormath/sse/boolInVec.h new file mode 100644 index 000000000..d18cb15ce --- /dev/null +++ b/src/BulletMultiThreaded/vectormath/sse/boolInVec.h @@ -0,0 +1,247 @@ +/* + Copyright (C) 2006, 2007 Sony Computer Entertainment Inc. + All rights reserved. + + Redistribution and use in source and binary forms, + with or without modification, are permitted provided that the + following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the Sony Computer Entertainment Inc nor the names + of its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _BOOLINVEC_H +#define _BOOLINVEC_H + +#include + +namespace Vectormath { + +class floatInVec; + +//-------------------------------------------------------------------------------------------------- +// boolInVec class +// + +class boolInVec +{ + private: + __m128 mData; + + inline boolInVec(__m128 vec); + public: + inline boolInVec() {} + + // matches standard type conversions + // + inline boolInVec(const floatInVec &vec); + + // explicit cast from bool + // + explicit inline boolInVec(bool scalar); + +#ifdef _VECTORMATH_NO_SCALAR_CAST + // explicit cast to bool + // + inline bool getAsBool() const; +#else + // implicit cast to bool + // + inline operator bool() const; +#endif + + // get vector data + // bool value is splatted across all word slots of vector as 0 (false) or -1 (true) + // + inline __m128 get128() const; + + // operators + // + inline const boolInVec operator ! () const; + inline boolInVec& operator = (const boolInVec &vec); + inline boolInVec& operator &= (const boolInVec &vec); + inline boolInVec& operator ^= (const boolInVec &vec); + inline boolInVec& operator |= (const boolInVec &vec); + + // friend functions + // + friend inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1); + friend inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1); + friend inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1); + friend inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1); + friend inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1); + friend inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1); + friend inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1); + friend inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1); + friend inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1); + friend inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1); + friend inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1); + friend inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1); +}; + +//-------------------------------------------------------------------------------------------------- +// boolInVec functions +// + +// operators +// +inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1); +inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1); +inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1); +inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1); +inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1); + +// select between vec0 and vec1 using boolInVec. +// false selects vec0, true selects vec1 +// +inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1); + +} // namespace Vectormath + +//-------------------------------------------------------------------------------------------------- +// boolInVec implementation +// + +#include "floatInVec.h" + +namespace Vectormath { + +inline +boolInVec::boolInVec(__m128 vec) +{ + mData = vec; +} + +inline +boolInVec::boolInVec(const floatInVec &vec) +{ + *this = (vec != floatInVec(0.0f)); +} + +inline +boolInVec::boolInVec(bool scalar) +{ + unsigned int mask = -(int)scalar; + mData = _mm_set1_ps(*(float *)&mask); // TODO: Union +} + +#ifdef _VECTORMATH_NO_SCALAR_CAST +inline +bool +boolInVec::getAsBool() const +#else +inline +boolInVec::operator bool() const +#endif +{ + return *(bool *)&mData; +} + +inline +__m128 +boolInVec::get128() const +{ + return mData; +} + +inline +const boolInVec +boolInVec::operator ! () const +{ + return boolInVec(_mm_andnot_ps(mData, _mm_cmpneq_ps(_mm_setzero_ps(),_mm_setzero_ps()))); +} + +inline +boolInVec& +boolInVec::operator = (const boolInVec &vec) +{ + mData = vec.mData; + return *this; +} + +inline +boolInVec& +boolInVec::operator &= (const boolInVec &vec) +{ + *this = *this & vec; + return *this; +} + +inline +boolInVec& +boolInVec::operator ^= (const boolInVec &vec) +{ + *this = *this ^ vec; + return *this; +} + +inline +boolInVec& +boolInVec::operator |= (const boolInVec &vec) +{ + *this = *this | vec; + return *this; +} + +inline +const boolInVec +operator == (const boolInVec &vec0, const boolInVec &vec1) +{ + return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128())); +} + +inline +const boolInVec +operator != (const boolInVec &vec0, const boolInVec &vec1) +{ + return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128())); +} + +inline +const boolInVec +operator & (const boolInVec &vec0, const boolInVec &vec1) +{ + return boolInVec(_mm_and_ps(vec0.get128(), vec1.get128())); +} + +inline +const boolInVec +operator | (const boolInVec &vec0, const boolInVec &vec1) +{ + return boolInVec(_mm_or_ps(vec0.get128(), vec1.get128())); +} + +inline +const boolInVec +operator ^ (const boolInVec &vec0, const boolInVec &vec1) +{ + return boolInVec(_mm_xor_ps(vec0.get128(), vec1.get128())); +} + +inline +const boolInVec +select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1) +{ + return boolInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128())); +} + +} // namespace Vectormath + +#endif // boolInVec_h diff --git a/src/BulletMultiThreaded/vectormath/sse/floatInVec.h b/src/BulletMultiThreaded/vectormath/sse/floatInVec.h new file mode 100644 index 000000000..6443865b1 --- /dev/null +++ b/src/BulletMultiThreaded/vectormath/sse/floatInVec.h @@ -0,0 +1,340 @@ +/* + Copyright (C) 2006, 2007 Sony Computer Entertainment Inc. + All rights reserved. + + Redistribution and use in source and binary forms, + with or without modification, are permitted provided that the + following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the Sony Computer Entertainment Inc nor the names + of its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _FLOATINVEC_H +#define _FLOATINVEC_H + +#include +#include + +namespace Vectormath { + +class boolInVec; + +//-------------------------------------------------------------------------------------------------- +// floatInVec class +// + +class floatInVec +{ + private: + __m128 mData; + + public: + inline floatInVec(__m128 vec); + + inline floatInVec() {} + + // matches standard type conversions + // + inline floatInVec(const boolInVec &vec); + + // construct from a slot of __m128 + // + inline floatInVec(__m128 vec, int slot); + + // explicit cast from float + // + explicit inline floatInVec(float scalar); + +#ifdef _VECTORMATH_NO_SCALAR_CAST + // explicit cast to float + // + inline float getAsFloat() const; +#else + // implicit cast to float + // + inline operator float() const; +#endif + + // get vector data + // float value is splatted across all word slots of vector + // + inline __m128 get128() const; + + // operators + // + inline const floatInVec operator ++ (int); + inline const floatInVec operator -- (int); + inline floatInVec& operator ++ (); + inline floatInVec& operator -- (); + inline const floatInVec operator - () const; + inline floatInVec& operator = (const floatInVec &vec); + inline floatInVec& operator *= (const floatInVec &vec); + inline floatInVec& operator /= (const floatInVec &vec); + inline floatInVec& operator += (const floatInVec &vec); + inline floatInVec& operator -= (const floatInVec &vec); + + // friend functions + // + friend inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1); + friend inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1); + friend inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1); + friend inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1); + friend inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, boolInVec select_vec1); +}; + +//-------------------------------------------------------------------------------------------------- +// floatInVec functions +// + +// operators +// +inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1); +inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1); +inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1); +inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1); +inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1); +inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1); +inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1); +inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1); +inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1); +inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1); + +// select between vec0 and vec1 using boolInVec. +// false selects vec0, true selects vec1 +// +inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1); + +} // namespace Vectormath + +//-------------------------------------------------------------------------------------------------- +// floatInVec implementation +// + +#include "boolInVec.h" + +namespace Vectormath { + +inline +floatInVec::floatInVec(__m128 vec) +{ + mData = vec; +} + +inline +floatInVec::floatInVec(const boolInVec &vec) +{ + mData = vec_sel(_mm_setzero_ps(), _mm_set1_ps(1.0f), vec.get128()); +} + +inline +floatInVec::floatInVec(__m128 vec, int slot) +{ + SSEFloat v; + v.m128 = vec; + mData = _mm_set1_ps(v.f[slot]); +} + +inline +floatInVec::floatInVec(float scalar) +{ + mData = _mm_set1_ps(scalar); +} + +#ifdef _VECTORMATH_NO_SCALAR_CAST +inline +float +floatInVec::getAsFloat() const +#else +inline +floatInVec::operator float() const +#endif +{ + return *((float *)&mData); +} + +inline +__m128 +floatInVec::get128() const +{ + return mData; +} + +inline +const floatInVec +floatInVec::operator ++ (int) +{ + __m128 olddata = mData; + operator ++(); + return floatInVec(olddata); +} + +inline +const floatInVec +floatInVec::operator -- (int) +{ + __m128 olddata = mData; + operator --(); + return floatInVec(olddata); +} + +inline +floatInVec& +floatInVec::operator ++ () +{ + *this += floatInVec(_mm_set1_ps(1.0f)); + return *this; +} + +inline +floatInVec& +floatInVec::operator -- () +{ + *this -= floatInVec(_mm_set1_ps(1.0f)); + return *this; +} + +inline +const floatInVec +floatInVec::operator - () const +{ + return floatInVec(_mm_sub_ps(_mm_setzero_ps(), mData)); +} + +inline +floatInVec& +floatInVec::operator = (const floatInVec &vec) +{ + mData = vec.mData; + return *this; +} + +inline +floatInVec& +floatInVec::operator *= (const floatInVec &vec) +{ + *this = *this * vec; + return *this; +} + +inline +floatInVec& +floatInVec::operator /= (const floatInVec &vec) +{ + *this = *this / vec; + return *this; +} + +inline +floatInVec& +floatInVec::operator += (const floatInVec &vec) +{ + *this = *this + vec; + return *this; +} + +inline +floatInVec& +floatInVec::operator -= (const floatInVec &vec) +{ + *this = *this - vec; + return *this; +} + +inline +const floatInVec +operator * (const floatInVec &vec0, const floatInVec &vec1) +{ + return floatInVec(_mm_mul_ps(vec0.get128(), vec1.get128())); +} + +inline +const floatInVec +operator / (const floatInVec &num, const floatInVec &den) +{ + return floatInVec(_mm_div_ps(num.get128(), den.get128())); +} + +inline +const floatInVec +operator + (const floatInVec &vec0, const floatInVec &vec1) +{ + return floatInVec(_mm_add_ps(vec0.get128(), vec1.get128())); +} + +inline +const floatInVec +operator - (const floatInVec &vec0, const floatInVec &vec1) +{ + return floatInVec(_mm_sub_ps(vec0.get128(), vec1.get128())); +} + +inline +const boolInVec +operator < (const floatInVec &vec0, const floatInVec &vec1) +{ + return boolInVec(_mm_cmpgt_ps(vec1.get128(), vec0.get128())); +} + +inline +const boolInVec +operator <= (const floatInVec &vec0, const floatInVec &vec1) +{ + return boolInVec(_mm_cmpge_ps(vec1.get128(), vec0.get128())); +} + +inline +const boolInVec +operator > (const floatInVec &vec0, const floatInVec &vec1) +{ + return boolInVec(_mm_cmpgt_ps(vec0.get128(), vec1.get128())); +} + +inline +const boolInVec +operator >= (const floatInVec &vec0, const floatInVec &vec1) +{ + return boolInVec(_mm_cmpge_ps(vec0.get128(), vec1.get128())); +} + +inline +const boolInVec +operator == (const floatInVec &vec0, const floatInVec &vec1) +{ + return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128())); +} + +inline +const boolInVec +operator != (const floatInVec &vec0, const floatInVec &vec1) +{ + return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128())); +} + +inline +const floatInVec +select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1) +{ + return floatInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128())); +} + +} // namespace Vectormath + +#endif // floatInVec_h diff --git a/src/BulletMultiThreaded/vectormath/sse/mat_aos.h b/src/BulletMultiThreaded/vectormath/sse/mat_aos.h new file mode 100644 index 000000000..2ba27bf2a --- /dev/null +++ b/src/BulletMultiThreaded/vectormath/sse/mat_aos.h @@ -0,0 +1,2190 @@ +/* + Copyright (C) 2006, 2007 Sony Computer Entertainment Inc. + All rights reserved. + + Redistribution and use in source and binary forms, + with or without modification, are permitted provided that the + following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the Sony Computer Entertainment Inc nor the names + of its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + + +#ifndef _VECTORMATH_MAT_AOS_CPP_H +#define _VECTORMATH_MAT_AOS_CPP_H + +namespace Vectormath { +namespace Aos { + +//----------------------------------------------------------------------------- +// Constants +// for shuffles, words are labeled [x,y,z,w] [a,b,c,d] + +#define _VECTORMATH_PERM_ZBWX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_B, _VECTORMATH_PERM_W, _VECTORMATH_PERM_X }) +#define _VECTORMATH_PERM_XCYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_C, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X }) +#define _VECTORMATH_PERM_XYAB ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B }) +#define _VECTORMATH_PERM_ZWCD ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_W, _VECTORMATH_PERM_C, _VECTORMATH_PERM_D }) +#define _VECTORMATH_PERM_XZBX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_B, _VECTORMATH_PERM_X }) +#define _VECTORMATH_PERM_CXXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X }) +#define _VECTORMATH_PERM_YAXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X }) +#define _VECTORMATH_PERM_XAZC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_C }) +#define _VECTORMATH_PERM_YXWZ ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X, _VECTORMATH_PERM_W, _VECTORMATH_PERM_Z }) +#define _VECTORMATH_PERM_YBWD ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_B, _VECTORMATH_PERM_W, _VECTORMATH_PERM_D }) +#define _VECTORMATH_PERM_XYCX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C, _VECTORMATH_PERM_X }) +#define _VECTORMATH_PERM_YCXY ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y }) +#define _VECTORMATH_PERM_CXYC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C }) +#define _VECTORMATH_PERM_ZAYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X }) +#define _VECTORMATH_PERM_BZXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_B, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X }) +#define _VECTORMATH_PERM_XZYA ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A }) +#define _VECTORMATH_PERM_ZXXB ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_B }) +#define _VECTORMATH_PERM_YXXC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_C }) +#define _VECTORMATH_PERM_BBYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_B, _VECTORMATH_PERM_B, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X }) +#define _VECTORMATH_PI_OVER_2 1.570796327f + +//----------------------------------------------------------------------------- +// Definitions + +__forceinline Matrix3::Matrix3( const Matrix3 & mat ) +{ + mCol0 = mat.mCol0; + mCol1 = mat.mCol1; + mCol2 = mat.mCol2; +} + +__forceinline Matrix3::Matrix3( float scalar ) +{ + mCol0 = Vector3( scalar ); + mCol1 = Vector3( scalar ); + mCol2 = Vector3( scalar ); +} + +__forceinline Matrix3::Matrix3( const floatInVec &scalar ) +{ + mCol0 = Vector3( scalar ); + mCol1 = Vector3( scalar ); + mCol2 = Vector3( scalar ); +} + +__forceinline Matrix3::Matrix3( const Quat &unitQuat ) +{ + __m128 xyzw_2, wwww, yzxw, zxyw, yzxw_2, zxyw_2; + __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __declspec(align(16)) unsigned int sx[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int sz[4] = {0, 0, 0xffffffff, 0}; + __m128 select_x = _mm_load_ps((float *)sx); + __m128 select_z = _mm_load_ps((float *)sz); + + xyzw_2 = _mm_add_ps( unitQuat.get128(), unitQuat.get128() ); + wwww = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,3,3,3) ); + yzxw = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,0,2,1) ); + zxyw = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,1,0,2) ); + yzxw_2 = _mm_shuffle_ps( xyzw_2, xyzw_2, _MM_SHUFFLE(3,0,2,1) ); + zxyw_2 = _mm_shuffle_ps( xyzw_2, xyzw_2, _MM_SHUFFLE(3,1,0,2) ); + + tmp0 = _mm_mul_ps( yzxw_2, wwww ); // tmp0 = 2yw, 2zw, 2xw, 2w2 + tmp1 = _mm_sub_ps( _mm_set1_ps(1.0f), _mm_mul_ps(yzxw, yzxw_2) ); // tmp1 = 1 - 2y2, 1 - 2z2, 1 - 2x2, 1 - 2w2 + tmp2 = _mm_mul_ps( yzxw, xyzw_2 ); // tmp2 = 2xy, 2yz, 2xz, 2w2 + tmp0 = _mm_add_ps( _mm_mul_ps(zxyw, xyzw_2), tmp0 ); // tmp0 = 2yw + 2zx, 2zw + 2xy, 2xw + 2yz, 2w2 + 2w2 + tmp1 = _mm_sub_ps( tmp1, _mm_mul_ps(zxyw, zxyw_2) ); // tmp1 = 1 - 2y2 - 2z2, 1 - 2z2 - 2x2, 1 - 2x2 - 2y2, 1 - 2w2 - 2w2 + tmp2 = _mm_sub_ps( tmp2, _mm_mul_ps(zxyw_2, wwww) ); // tmp2 = 2xy - 2zw, 2yz - 2xw, 2xz - 2yw, 2w2 -2w2 + + tmp3 = vec_sel( tmp0, tmp1, select_x ); + tmp4 = vec_sel( tmp1, tmp2, select_x ); + tmp5 = vec_sel( tmp2, tmp0, select_x ); + mCol0 = Vector3( vec_sel( tmp3, tmp2, select_z ) ); + mCol1 = Vector3( vec_sel( tmp4, tmp0, select_z ) ); + mCol2 = Vector3( vec_sel( tmp5, tmp1, select_z ) ); +} + +__forceinline Matrix3::Matrix3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2 ) +{ + mCol0 = _col0; + mCol1 = _col1; + mCol2 = _col2; +} + +__forceinline Matrix3 & Matrix3::setCol0( const Vector3 &_col0 ) +{ + mCol0 = _col0; + return *this; +} + +__forceinline Matrix3 & Matrix3::setCol1( const Vector3 &_col1 ) +{ + mCol1 = _col1; + return *this; +} + +__forceinline Matrix3 & Matrix3::setCol2( const Vector3 &_col2 ) +{ + mCol2 = _col2; + return *this; +} + +__forceinline Matrix3 & Matrix3::setCol( int col, const Vector3 &vec ) +{ + *(&mCol0 + col) = vec; + return *this; +} + +__forceinline Matrix3 & Matrix3::setRow( int row, const Vector3 &vec ) +{ + mCol0.setElem( row, vec.getElem( 0 ) ); + mCol1.setElem( row, vec.getElem( 1 ) ); + mCol2.setElem( row, vec.getElem( 2 ) ); + return *this; +} + +__forceinline Matrix3 & Matrix3::setElem( int col, int row, float val ) +{ + (*this)[col].setElem(row, val); + return *this; +} + +__forceinline Matrix3 & Matrix3::setElem( int col, int row, const floatInVec &val ) +{ + Vector3 tmpV3_0; + tmpV3_0 = this->getCol( col ); + tmpV3_0.setElem( row, val ); + this->setCol( col, tmpV3_0 ); + return *this; +} + +__forceinline const floatInVec Matrix3::getElem( int col, int row ) const +{ + return this->getCol( col ).getElem( row ); +} + +__forceinline const Vector3 Matrix3::getCol0( ) const +{ + return mCol0; +} + +__forceinline const Vector3 Matrix3::getCol1( ) const +{ + return mCol1; +} + +__forceinline const Vector3 Matrix3::getCol2( ) const +{ + return mCol2; +} + +__forceinline const Vector3 Matrix3::getCol( int col ) const +{ + return *(&mCol0 + col); +} + +__forceinline const Vector3 Matrix3::getRow( int row ) const +{ + return Vector3( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ) ); +} + +__forceinline Vector3 & Matrix3::operator []( int col ) +{ + return *(&mCol0 + col); +} + +__forceinline const Vector3 Matrix3::operator []( int col ) const +{ + return *(&mCol0 + col); +} + +__forceinline Matrix3 & Matrix3::operator =( const Matrix3 & mat ) +{ + mCol0 = mat.mCol0; + mCol1 = mat.mCol1; + mCol2 = mat.mCol2; + return *this; +} + +__forceinline const Matrix3 transpose( const Matrix3 & mat ) +{ + __m128 tmp0, tmp1, res0, res1, res2; + tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() ); + tmp1 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() ); + res0 = vec_mergeh( tmp0, mat.getCol1().get128() ); + //res1 = vec_perm( tmp0, mat.getCol1().get128(), _VECTORMATH_PERM_ZBWX ); + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + res1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2)); + res1 = vec_sel(res1, mat.getCol1().get128(), select_y); + //res2 = vec_perm( tmp1, mat.getCol1().get128(), _VECTORMATH_PERM_XCYX ); + res2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0)); + res2 = vec_sel(res2, vec_splat(mat.getCol1().get128(), 2), select_y); + return Matrix3( + Vector3( res0 ), + Vector3( res1 ), + Vector3( res2 ) + ); +} + +__forceinline const Matrix3 inverse( const Matrix3 & mat ) +{ + __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet, inv0, inv1, inv2; + tmp2 = _vmathVfCross( mat.getCol0().get128(), mat.getCol1().get128() ); + tmp0 = _vmathVfCross( mat.getCol1().get128(), mat.getCol2().get128() ); + tmp1 = _vmathVfCross( mat.getCol2().get128(), mat.getCol0().get128() ); + dot = _vmathVfDot3( tmp2, mat.getCol2().get128() ); + dot = vec_splat( dot, 0 ); + invdet = recipf4( dot ); + tmp3 = vec_mergeh( tmp0, tmp2 ); + tmp4 = vec_mergel( tmp0, tmp2 ); + inv0 = vec_mergeh( tmp3, tmp1 ); + //inv1 = vec_perm( tmp3, tmp1, _VECTORMATH_PERM_ZBWX ); + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + inv1 = _mm_shuffle_ps( tmp3, tmp3, _MM_SHUFFLE(0,3,2,2)); + inv1 = vec_sel(inv1, tmp1, select_y); + //inv2 = vec_perm( tmp4, tmp1, _VECTORMATH_PERM_XCYX ); + inv2 = _mm_shuffle_ps( tmp4, tmp4, _MM_SHUFFLE(0,1,1,0)); + inv2 = vec_sel(inv2, vec_splat(tmp1, 2), select_y); + inv0 = vec_mul( inv0, invdet ); + inv1 = vec_mul( inv1, invdet ); + inv2 = vec_mul( inv2, invdet ); + return Matrix3( + Vector3( inv0 ), + Vector3( inv1 ), + Vector3( inv2 ) + ); +} + +__forceinline const floatInVec determinant( const Matrix3 & mat ) +{ + return dot( mat.getCol2(), cross( mat.getCol0(), mat.getCol1() ) ); +} + +__forceinline const Matrix3 Matrix3::operator +( const Matrix3 & mat ) const +{ + return Matrix3( + ( mCol0 + mat.mCol0 ), + ( mCol1 + mat.mCol1 ), + ( mCol2 + mat.mCol2 ) + ); +} + +__forceinline const Matrix3 Matrix3::operator -( const Matrix3 & mat ) const +{ + return Matrix3( + ( mCol0 - mat.mCol0 ), + ( mCol1 - mat.mCol1 ), + ( mCol2 - mat.mCol2 ) + ); +} + +__forceinline Matrix3 & Matrix3::operator +=( const Matrix3 & mat ) +{ + *this = *this + mat; + return *this; +} + +__forceinline Matrix3 & Matrix3::operator -=( const Matrix3 & mat ) +{ + *this = *this - mat; + return *this; +} + +__forceinline const Matrix3 Matrix3::operator -( ) const +{ + return Matrix3( + ( -mCol0 ), + ( -mCol1 ), + ( -mCol2 ) + ); +} + +__forceinline const Matrix3 absPerElem( const Matrix3 & mat ) +{ + return Matrix3( + absPerElem( mat.getCol0() ), + absPerElem( mat.getCol1() ), + absPerElem( mat.getCol2() ) + ); +} + +__forceinline const Matrix3 Matrix3::operator *( float scalar ) const +{ + return *this * floatInVec(scalar); +} + +__forceinline const Matrix3 Matrix3::operator *( const floatInVec &scalar ) const +{ + return Matrix3( + ( mCol0 * scalar ), + ( mCol1 * scalar ), + ( mCol2 * scalar ) + ); +} + +__forceinline Matrix3 & Matrix3::operator *=( float scalar ) +{ + return *this *= floatInVec(scalar); +} + +__forceinline Matrix3 & Matrix3::operator *=( const floatInVec &scalar ) +{ + *this = *this * scalar; + return *this; +} + +__forceinline const Matrix3 operator *( float scalar, const Matrix3 & mat ) +{ + return floatInVec(scalar) * mat; +} + +__forceinline const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat ) +{ + return mat * scalar; +} + +__forceinline const Vector3 Matrix3::operator *( const Vector3 &vec ) const +{ + __m128 res; + __m128 xxxx, yyyy, zzzz; + xxxx = vec_splat( vec.get128(), 0 ); + yyyy = vec_splat( vec.get128(), 1 ); + zzzz = vec_splat( vec.get128(), 2 ); + res = vec_mul( mCol0.get128(), xxxx ); + res = vec_madd( mCol1.get128(), yyyy, res ); + res = vec_madd( mCol2.get128(), zzzz, res ); + return Vector3( res ); +} + +__forceinline const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const +{ + return Matrix3( + ( *this * mat.mCol0 ), + ( *this * mat.mCol1 ), + ( *this * mat.mCol2 ) + ); +} + +__forceinline Matrix3 & Matrix3::operator *=( const Matrix3 & mat ) +{ + *this = *this * mat; + return *this; +} + +__forceinline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 ) +{ + return Matrix3( + mulPerElem( mat0.getCol0(), mat1.getCol0() ), + mulPerElem( mat0.getCol1(), mat1.getCol1() ), + mulPerElem( mat0.getCol2(), mat1.getCol2() ) + ); +} + +__forceinline const Matrix3 Matrix3::identity( ) +{ + return Matrix3( + Vector3::xAxis( ), + Vector3::yAxis( ), + Vector3::zAxis( ) + ); +} + +__forceinline const Matrix3 Matrix3::rotationX( float radians ) +{ + return rotationX( floatInVec(radians) ); +} + +__forceinline const Matrix3 Matrix3::rotationX( const floatInVec &radians ) +{ + __m128 s, c, res1, res2; + __m128 zero; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + zero = _mm_setzero_ps(); + sincosf4( radians.get128(), &s, &c ); + res1 = vec_sel( zero, c, select_y ); + res1 = vec_sel( res1, s, select_z ); + res2 = vec_sel( zero, negatef4(s), select_y ); + res2 = vec_sel( res2, c, select_z ); + return Matrix3( + Vector3::xAxis( ), + Vector3( res1 ), + Vector3( res2 ) + ); +} + +__forceinline const Matrix3 Matrix3::rotationY( float radians ) +{ + return rotationY( floatInVec(radians) ); +} + +__forceinline const Matrix3 Matrix3::rotationY( const floatInVec &radians ) +{ + __m128 s, c, res0, res2; + __m128 zero; + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + zero = _mm_setzero_ps(); + sincosf4( radians.get128(), &s, &c ); + res0 = vec_sel( zero, c, select_x ); + res0 = vec_sel( res0, negatef4(s), select_z ); + res2 = vec_sel( zero, s, select_x ); + res2 = vec_sel( res2, c, select_z ); + return Matrix3( + Vector3( res0 ), + Vector3::yAxis( ), + Vector3( res2 ) + ); +} + +__forceinline const Matrix3 Matrix3::rotationZ( float radians ) +{ + return rotationZ( floatInVec(radians) ); +} + +__forceinline const Matrix3 Matrix3::rotationZ( const floatInVec &radians ) +{ + __m128 s, c, res0, res1; + __m128 zero; + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + zero = _mm_setzero_ps(); + sincosf4( radians.get128(), &s, &c ); + res0 = vec_sel( zero, c, select_x ); + res0 = vec_sel( res0, s, select_y ); + res1 = vec_sel( zero, negatef4(s), select_x ); + res1 = vec_sel( res1, c, select_y ); + return Matrix3( + Vector3( res0 ), + Vector3( res1 ), + Vector3::zAxis( ) + ); +} + +__forceinline const Matrix3 Matrix3::rotationZYX( const Vector3 &radiansXYZ ) +{ + __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp; + angles = Vector4( radiansXYZ, 0.0f ).get128(); + sincosf4( angles, &s, &c ); + negS = negatef4( s ); + Z0 = vec_mergel( c, s ); + Z1 = vec_mergel( negS, c ); + __declspec(align(16)) unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0}; + Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) ); + Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) ); + Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) ); + X0 = vec_splat( s, 0 ); + X1 = vec_splat( c, 0 ); + tmp = vec_mul( Z0, Y1 ); + return Matrix3( + Vector3( vec_mul( Z0, Y0 ) ), + Vector3( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ), + Vector3( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) ) + ); +} + +__forceinline const Matrix3 Matrix3::rotation( float radians, const Vector3 &unitVec ) +{ + return rotation( floatInVec(radians), unitVec ); +} + +__forceinline const Matrix3 Matrix3::rotation( const floatInVec &radians, const Vector3 &unitVec ) +{ + __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2; + axis = unitVec.get128(); + sincosf4( radians.get128(), &s, &c ); + xxxx = vec_splat( axis, 0 ); + yyyy = vec_splat( axis, 1 ); + zzzz = vec_splat( axis, 2 ); + oneMinusC = vec_sub( _mm_set1_ps(1.0f), c ); + axisS = vec_mul( axis, s ); + negAxisS = negatef4( axisS ); + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + //tmp0 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_XZBX ); + tmp0 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,2,0) ); + tmp0 = vec_sel(tmp0, vec_splat(negAxisS, 1), select_z); + //tmp1 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_CXXX ); + tmp1 = vec_sel( vec_splat(axisS, 0), vec_splat(negAxisS, 2), select_x ); + //tmp2 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_YAXX ); + tmp2 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,0,1) ); + tmp2 = vec_sel(tmp2, vec_splat(negAxisS, 0), select_y); + tmp0 = vec_sel( tmp0, c, select_x ); + tmp1 = vec_sel( tmp1, c, select_y ); + tmp2 = vec_sel( tmp2, c, select_z ); + return Matrix3( + Vector3( vec_madd( vec_mul( axis, xxxx ), oneMinusC, tmp0 ) ), + Vector3( vec_madd( vec_mul( axis, yyyy ), oneMinusC, tmp1 ) ), + Vector3( vec_madd( vec_mul( axis, zzzz ), oneMinusC, tmp2 ) ) + ); +} + +__forceinline const Matrix3 Matrix3::rotation( const Quat &unitQuat ) +{ + return Matrix3( unitQuat ); +} + +__forceinline const Matrix3 Matrix3::scale( const Vector3 &scaleVec ) +{ + __m128 zero = _mm_setzero_ps(); + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + return Matrix3( + Vector3( vec_sel( zero, scaleVec.get128(), select_x ) ), + Vector3( vec_sel( zero, scaleVec.get128(), select_y ) ), + Vector3( vec_sel( zero, scaleVec.get128(), select_z ) ) + ); +} + +__forceinline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec ) +{ + return Matrix3( + ( mat.getCol0() * scaleVec.getX( ) ), + ( mat.getCol1() * scaleVec.getY( ) ), + ( mat.getCol2() * scaleVec.getZ( ) ) + ); +} + +__forceinline const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat ) +{ + return Matrix3( + mulPerElem( mat.getCol0(), scaleVec ), + mulPerElem( mat.getCol1(), scaleVec ), + mulPerElem( mat.getCol2(), scaleVec ) + ); +} + +__forceinline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 ) +{ + return Matrix3( + select( mat0.getCol0(), mat1.getCol0(), select1 ), + select( mat0.getCol1(), mat1.getCol1(), select1 ), + select( mat0.getCol2(), mat1.getCol2(), select1 ) + ); +} + +__forceinline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 ) +{ + return Matrix3( + select( mat0.getCol0(), mat1.getCol0(), select1 ), + select( mat0.getCol1(), mat1.getCol1(), select1 ), + select( mat0.getCol2(), mat1.getCol2(), select1 ) + ); +} + +#ifdef _VECTORMATH_DEBUG + +__forceinline void print( const Matrix3 & mat ) +{ + print( mat.getRow( 0 ) ); + print( mat.getRow( 1 ) ); + print( mat.getRow( 2 ) ); +} + +__forceinline void print( const Matrix3 & mat, const char * name ) +{ + printf("%s:\n", name); + print( mat ); +} + +#endif + +__forceinline Matrix4::Matrix4( const Matrix4 & mat ) +{ + mCol0 = mat.mCol0; + mCol1 = mat.mCol1; + mCol2 = mat.mCol2; + mCol3 = mat.mCol3; +} + +__forceinline Matrix4::Matrix4( float scalar ) +{ + mCol0 = Vector4( scalar ); + mCol1 = Vector4( scalar ); + mCol2 = Vector4( scalar ); + mCol3 = Vector4( scalar ); +} + +__forceinline Matrix4::Matrix4( const floatInVec &scalar ) +{ + mCol0 = Vector4( scalar ); + mCol1 = Vector4( scalar ); + mCol2 = Vector4( scalar ); + mCol3 = Vector4( scalar ); +} + +__forceinline Matrix4::Matrix4( const Transform3 & mat ) +{ + mCol0 = Vector4( mat.getCol0(), 0.0f ); + mCol1 = Vector4( mat.getCol1(), 0.0f ); + mCol2 = Vector4( mat.getCol2(), 0.0f ); + mCol3 = Vector4( mat.getCol3(), 1.0f ); +} + +__forceinline Matrix4::Matrix4( const Vector4 &_col0, const Vector4 &_col1, const Vector4 &_col2, const Vector4 &_col3 ) +{ + mCol0 = _col0; + mCol1 = _col1; + mCol2 = _col2; + mCol3 = _col3; +} + +__forceinline Matrix4::Matrix4( const Matrix3 & mat, const Vector3 &translateVec ) +{ + mCol0 = Vector4( mat.getCol0(), 0.0f ); + mCol1 = Vector4( mat.getCol1(), 0.0f ); + mCol2 = Vector4( mat.getCol2(), 0.0f ); + mCol3 = Vector4( translateVec, 1.0f ); +} + +__forceinline Matrix4::Matrix4( const Quat &unitQuat, const Vector3 &translateVec ) +{ + Matrix3 mat; + mat = Matrix3( unitQuat ); + mCol0 = Vector4( mat.getCol0(), 0.0f ); + mCol1 = Vector4( mat.getCol1(), 0.0f ); + mCol2 = Vector4( mat.getCol2(), 0.0f ); + mCol3 = Vector4( translateVec, 1.0f ); +} + +__forceinline Matrix4 & Matrix4::setCol0( const Vector4 &_col0 ) +{ + mCol0 = _col0; + return *this; +} + +__forceinline Matrix4 & Matrix4::setCol1( const Vector4 &_col1 ) +{ + mCol1 = _col1; + return *this; +} + +__forceinline Matrix4 & Matrix4::setCol2( const Vector4 &_col2 ) +{ + mCol2 = _col2; + return *this; +} + +__forceinline Matrix4 & Matrix4::setCol3( const Vector4 &_col3 ) +{ + mCol3 = _col3; + return *this; +} + +__forceinline Matrix4 & Matrix4::setCol( int col, const Vector4 &vec ) +{ + *(&mCol0 + col) = vec; + return *this; +} + +__forceinline Matrix4 & Matrix4::setRow( int row, const Vector4 &vec ) +{ + mCol0.setElem( row, vec.getElem( 0 ) ); + mCol1.setElem( row, vec.getElem( 1 ) ); + mCol2.setElem( row, vec.getElem( 2 ) ); + mCol3.setElem( row, vec.getElem( 3 ) ); + return *this; +} + +__forceinline Matrix4 & Matrix4::setElem( int col, int row, float val ) +{ + (*this)[col].setElem(row, val); + return *this; +} + +__forceinline Matrix4 & Matrix4::setElem( int col, int row, const floatInVec &val ) +{ + Vector4 tmpV3_0; + tmpV3_0 = this->getCol( col ); + tmpV3_0.setElem( row, val ); + this->setCol( col, tmpV3_0 ); + return *this; +} + +__forceinline const floatInVec Matrix4::getElem( int col, int row ) const +{ + return this->getCol( col ).getElem( row ); +} + +__forceinline const Vector4 Matrix4::getCol0( ) const +{ + return mCol0; +} + +__forceinline const Vector4 Matrix4::getCol1( ) const +{ + return mCol1; +} + +__forceinline const Vector4 Matrix4::getCol2( ) const +{ + return mCol2; +} + +__forceinline const Vector4 Matrix4::getCol3( ) const +{ + return mCol3; +} + +__forceinline const Vector4 Matrix4::getCol( int col ) const +{ + return *(&mCol0 + col); +} + +__forceinline const Vector4 Matrix4::getRow( int row ) const +{ + return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) ); +} + +__forceinline Vector4 & Matrix4::operator []( int col ) +{ + return *(&mCol0 + col); +} + +__forceinline const Vector4 Matrix4::operator []( int col ) const +{ + return *(&mCol0 + col); +} + +__forceinline Matrix4 & Matrix4::operator =( const Matrix4 & mat ) +{ + mCol0 = mat.mCol0; + mCol1 = mat.mCol1; + mCol2 = mat.mCol2; + mCol3 = mat.mCol3; + return *this; +} + +__forceinline const Matrix4 transpose( const Matrix4 & mat ) +{ + __m128 tmp0, tmp1, tmp2, tmp3, res0, res1, res2, res3; + tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() ); + tmp1 = vec_mergeh( mat.getCol1().get128(), mat.getCol3().get128() ); + tmp2 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() ); + tmp3 = vec_mergel( mat.getCol1().get128(), mat.getCol3().get128() ); + res0 = vec_mergeh( tmp0, tmp1 ); + res1 = vec_mergel( tmp0, tmp1 ); + res2 = vec_mergeh( tmp2, tmp3 ); + res3 = vec_mergel( tmp2, tmp3 ); + return Matrix4( + Vector4( res0 ), + Vector4( res1 ), + Vector4( res2 ), + Vector4( res3 ) + ); +} + +// TODO: Tidy +static __declspec(align(16)) const unsigned int _vmathPNPN[4] = {0x00000000, 0x80000000, 0x00000000, 0x80000000}; +static __declspec(align(16)) const unsigned int _vmathNPNP[4] = {0x80000000, 0x00000000, 0x80000000, 0x00000000}; +static __declspec(align(16)) const float _vmathZERONE[4] = {1.0f, 0.0f, 0.0f, 1.0f}; + +__forceinline const Matrix4 inverse( const Matrix4 & mat ) +{ + __m128 Va,Vb,Vc; + __m128 r1,r2,r3,tt,tt2; + __m128 sum,Det,RDet; + __m128 trns0,trns1,trns2,trns3; + + __m128 _L1 = mat.getCol0().get128(); + __m128 _L2 = mat.getCol1().get128(); + __m128 _L3 = mat.getCol2().get128(); + __m128 _L4 = mat.getCol3().get128(); + // Calculating the minterms for the first line. + + // _mm_ror_ps is just a macro using _mm_shuffle_ps(). + tt = _L4; tt2 = _mm_ror_ps(_L3,1); + Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0)); // V3'·V4 + Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2)); // V3'·V4" + Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3)); // V3'·V4^ + + r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2)); // V3"·V4^ - V3^·V4" + r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0)); // V3^·V4' - V3'·V4^ + r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1)); // V3'·V4" - V3"·V4' + + tt = _L2; + Va = _mm_ror_ps(tt,1); sum = _mm_mul_ps(Va,r1); + Vb = _mm_ror_ps(tt,2); sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2)); + Vc = _mm_ror_ps(tt,3); sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3)); + + // Calculating the determinant. + Det = _mm_mul_ps(sum,_L1); + Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det)); + + const __m128 Sign_PNPN = _mm_load_ps((float *)_vmathPNPN); + const __m128 Sign_NPNP = _mm_load_ps((float *)_vmathNPNP); + + __m128 mtL1 = _mm_xor_ps(sum,Sign_PNPN); + + // Calculating the minterms of the second line (using previous results). + tt = _mm_ror_ps(_L1,1); sum = _mm_mul_ps(tt,r1); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2)); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3)); + __m128 mtL2 = _mm_xor_ps(sum,Sign_NPNP); + + // Testing the determinant. + Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1)); + + // Calculating the minterms of the third line. + tt = _mm_ror_ps(_L1,1); + Va = _mm_mul_ps(tt,Vb); // V1'·V2" + Vb = _mm_mul_ps(tt,Vc); // V1'·V2^ + Vc = _mm_mul_ps(tt,_L2); // V1'·V2 + + r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2)); // V1"·V2^ - V1^·V2" + r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0)); // V1^·V2' - V1'·V2^ + r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1)); // V1'·V2" - V1"·V2' + + tt = _mm_ror_ps(_L4,1); sum = _mm_mul_ps(tt,r1); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2)); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3)); + __m128 mtL3 = _mm_xor_ps(sum,Sign_PNPN); + + // Dividing is FASTER than rcp_nr! (Because rcp_nr causes many register-memory RWs). + RDet = _mm_div_ss(_mm_load_ss((float *)&_vmathZERONE), Det); // TODO: just 1.0f? + RDet = _mm_shuffle_ps(RDet,RDet,0x00); + + // Devide the first 12 minterms with the determinant. + mtL1 = _mm_mul_ps(mtL1, RDet); + mtL2 = _mm_mul_ps(mtL2, RDet); + mtL3 = _mm_mul_ps(mtL3, RDet); + + // Calculate the minterms of the forth line and devide by the determinant. + tt = _mm_ror_ps(_L3,1); sum = _mm_mul_ps(tt,r1); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2)); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3)); + __m128 mtL4 = _mm_xor_ps(sum,Sign_NPNP); + mtL4 = _mm_mul_ps(mtL4, RDet); + + // Now we just have to transpose the minterms matrix. + trns0 = _mm_unpacklo_ps(mtL1,mtL2); + trns1 = _mm_unpacklo_ps(mtL3,mtL4); + trns2 = _mm_unpackhi_ps(mtL1,mtL2); + trns3 = _mm_unpackhi_ps(mtL3,mtL4); + _L1 = _mm_movelh_ps(trns0,trns1); + _L2 = _mm_movehl_ps(trns1,trns0); + _L3 = _mm_movelh_ps(trns2,trns3); + _L4 = _mm_movehl_ps(trns3,trns2); + + return Matrix4( + Vector4( _L1 ), + Vector4( _L2 ), + Vector4( _L3 ), + Vector4( _L4 ) + ); +} + +__forceinline const Matrix4 affineInverse( const Matrix4 & mat ) +{ + Transform3 affineMat; + affineMat.setCol0( mat.getCol0().getXYZ( ) ); + affineMat.setCol1( mat.getCol1().getXYZ( ) ); + affineMat.setCol2( mat.getCol2().getXYZ( ) ); + affineMat.setCol3( mat.getCol3().getXYZ( ) ); + return Matrix4( inverse( affineMat ) ); +} + +__forceinline const Matrix4 orthoInverse( const Matrix4 & mat ) +{ + Transform3 affineMat; + affineMat.setCol0( mat.getCol0().getXYZ( ) ); + affineMat.setCol1( mat.getCol1().getXYZ( ) ); + affineMat.setCol2( mat.getCol2().getXYZ( ) ); + affineMat.setCol3( mat.getCol3().getXYZ( ) ); + return Matrix4( orthoInverse( affineMat ) ); +} + +__forceinline const floatInVec determinant( const Matrix4 & mat ) +{ + __m128 Va,Vb,Vc; + __m128 r1,r2,r3,tt,tt2; + __m128 sum,Det; + + __m128 _L1 = mat.getCol0().get128(); + __m128 _L2 = mat.getCol1().get128(); + __m128 _L3 = mat.getCol2().get128(); + __m128 _L4 = mat.getCol3().get128(); + // Calculating the minterms for the first line. + + // _mm_ror_ps is just a macro using _mm_shuffle_ps(). + tt = _L4; tt2 = _mm_ror_ps(_L3,1); + Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0)); // V3'·V4 + Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2)); // V3'·V4" + Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3)); // V3'·V4^ + + r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2)); // V3"·V4^ - V3^·V4" + r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0)); // V3^·V4' - V3'·V4^ + r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1)); // V3'·V4" - V3"·V4' + + tt = _L2; + Va = _mm_ror_ps(tt,1); sum = _mm_mul_ps(Va,r1); + Vb = _mm_ror_ps(tt,2); sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2)); + Vc = _mm_ror_ps(tt,3); sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3)); + + // Calculating the determinant. + Det = _mm_mul_ps(sum,_L1); + Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det)); + + // Calculating the minterms of the second line (using previous results). + tt = _mm_ror_ps(_L1,1); sum = _mm_mul_ps(tt,r1); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2)); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3)); + + // Testing the determinant. + Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1)); + return floatInVec(Det, 0); +} + +__forceinline const Matrix4 Matrix4::operator +( const Matrix4 & mat ) const +{ + return Matrix4( + ( mCol0 + mat.mCol0 ), + ( mCol1 + mat.mCol1 ), + ( mCol2 + mat.mCol2 ), + ( mCol3 + mat.mCol3 ) + ); +} + +__forceinline const Matrix4 Matrix4::operator -( const Matrix4 & mat ) const +{ + return Matrix4( + ( mCol0 - mat.mCol0 ), + ( mCol1 - mat.mCol1 ), + ( mCol2 - mat.mCol2 ), + ( mCol3 - mat.mCol3 ) + ); +} + +__forceinline Matrix4 & Matrix4::operator +=( const Matrix4 & mat ) +{ + *this = *this + mat; + return *this; +} + +__forceinline Matrix4 & Matrix4::operator -=( const Matrix4 & mat ) +{ + *this = *this - mat; + return *this; +} + +__forceinline const Matrix4 Matrix4::operator -( ) const +{ + return Matrix4( + ( -mCol0 ), + ( -mCol1 ), + ( -mCol2 ), + ( -mCol3 ) + ); +} + +__forceinline const Matrix4 absPerElem( const Matrix4 & mat ) +{ + return Matrix4( + absPerElem( mat.getCol0() ), + absPerElem( mat.getCol1() ), + absPerElem( mat.getCol2() ), + absPerElem( mat.getCol3() ) + ); +} + +__forceinline const Matrix4 Matrix4::operator *( float scalar ) const +{ + return *this * floatInVec(scalar); +} + +__forceinline const Matrix4 Matrix4::operator *( const floatInVec &scalar ) const +{ + return Matrix4( + ( mCol0 * scalar ), + ( mCol1 * scalar ), + ( mCol2 * scalar ), + ( mCol3 * scalar ) + ); +} + +__forceinline Matrix4 & Matrix4::operator *=( float scalar ) +{ + return *this *= floatInVec(scalar); +} + +__forceinline Matrix4 & Matrix4::operator *=( const floatInVec &scalar ) +{ + *this = *this * scalar; + return *this; +} + +__forceinline const Matrix4 operator *( float scalar, const Matrix4 & mat ) +{ + return floatInVec(scalar) * mat; +} + +__forceinline const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat ) +{ + return mat * scalar; +} + +__forceinline const Vector4 Matrix4::operator *( const Vector4 &vec ) const +{ + return Vector4( + _mm_add_ps( + _mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(1,1,1,1)))), + _mm_add_ps(_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(2,2,2,2))), _mm_mul_ps(mCol3.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(3,3,3,3))))) + ); +} + +__forceinline const Vector4 Matrix4::operator *( const Vector3 &vec ) const +{ + return Vector4( + _mm_add_ps( + _mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(1,1,1,1)))), + _mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(2,2,2,2)))) + ); +} + +__forceinline const Vector4 Matrix4::operator *( const Point3 &pnt ) const +{ + return Vector4( + _mm_add_ps( + _mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(1,1,1,1)))), + _mm_add_ps(_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(2,2,2,2))), mCol3.get128())) + ); +} + +__forceinline const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const +{ + return Matrix4( + ( *this * mat.mCol0 ), + ( *this * mat.mCol1 ), + ( *this * mat.mCol2 ), + ( *this * mat.mCol3 ) + ); +} + +__forceinline Matrix4 & Matrix4::operator *=( const Matrix4 & mat ) +{ + *this = *this * mat; + return *this; +} + +__forceinline const Matrix4 Matrix4::operator *( const Transform3 & tfrm ) const +{ + return Matrix4( + ( *this * tfrm.getCol0() ), + ( *this * tfrm.getCol1() ), + ( *this * tfrm.getCol2() ), + ( *this * Point3( tfrm.getCol3() ) ) + ); +} + +__forceinline Matrix4 & Matrix4::operator *=( const Transform3 & tfrm ) +{ + *this = *this * tfrm; + return *this; +} + +__forceinline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 ) +{ + return Matrix4( + mulPerElem( mat0.getCol0(), mat1.getCol0() ), + mulPerElem( mat0.getCol1(), mat1.getCol1() ), + mulPerElem( mat0.getCol2(), mat1.getCol2() ), + mulPerElem( mat0.getCol3(), mat1.getCol3() ) + ); +} + +__forceinline const Matrix4 Matrix4::identity( ) +{ + return Matrix4( + Vector4::xAxis( ), + Vector4::yAxis( ), + Vector4::zAxis( ), + Vector4::wAxis( ) + ); +} + +__forceinline Matrix4 & Matrix4::setUpper3x3( const Matrix3 & mat3 ) +{ + mCol0.setXYZ( mat3.getCol0() ); + mCol1.setXYZ( mat3.getCol1() ); + mCol2.setXYZ( mat3.getCol2() ); + return *this; +} + +__forceinline const Matrix3 Matrix4::getUpper3x3( ) const +{ + return Matrix3( + mCol0.getXYZ( ), + mCol1.getXYZ( ), + mCol2.getXYZ( ) + ); +} + +__forceinline Matrix4 & Matrix4::setTranslation( const Vector3 &translateVec ) +{ + mCol3.setXYZ( translateVec ); + return *this; +} + +__forceinline const Vector3 Matrix4::getTranslation( ) const +{ + return mCol3.getXYZ( ); +} + +__forceinline const Matrix4 Matrix4::rotationX( float radians ) +{ + return rotationX( floatInVec(radians) ); +} + +__forceinline const Matrix4 Matrix4::rotationX( const floatInVec &radians ) +{ + __m128 s, c, res1, res2; + __m128 zero; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + zero = _mm_setzero_ps(); + sincosf4( radians.get128(), &s, &c ); + res1 = vec_sel( zero, c, select_y ); + res1 = vec_sel( res1, s, select_z ); + res2 = vec_sel( zero, negatef4(s), select_y ); + res2 = vec_sel( res2, c, select_z ); + return Matrix4( + Vector4::xAxis( ), + Vector4( res1 ), + Vector4( res2 ), + Vector4::wAxis( ) + ); +} + +__forceinline const Matrix4 Matrix4::rotationY( float radians ) +{ + return rotationY( floatInVec(radians) ); +} + +__forceinline const Matrix4 Matrix4::rotationY( const floatInVec &radians ) +{ + __m128 s, c, res0, res2; + __m128 zero; + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + zero = _mm_setzero_ps(); + sincosf4( radians.get128(), &s, &c ); + res0 = vec_sel( zero, c, select_x ); + res0 = vec_sel( res0, negatef4(s), select_z ); + res2 = vec_sel( zero, s, select_x ); + res2 = vec_sel( res2, c, select_z ); + return Matrix4( + Vector4( res0 ), + Vector4::yAxis( ), + Vector4( res2 ), + Vector4::wAxis( ) + ); +} + +__forceinline const Matrix4 Matrix4::rotationZ( float radians ) +{ + return rotationZ( floatInVec(radians) ); +} + +__forceinline const Matrix4 Matrix4::rotationZ( const floatInVec &radians ) +{ + __m128 s, c, res0, res1; + __m128 zero; + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + zero = _mm_setzero_ps(); + sincosf4( radians.get128(), &s, &c ); + res0 = vec_sel( zero, c, select_x ); + res0 = vec_sel( res0, s, select_y ); + res1 = vec_sel( zero, negatef4(s), select_x ); + res1 = vec_sel( res1, c, select_y ); + return Matrix4( + Vector4( res0 ), + Vector4( res1 ), + Vector4::zAxis( ), + Vector4::wAxis( ) + ); +} + +__forceinline const Matrix4 Matrix4::rotationZYX( const Vector3 &radiansXYZ ) +{ + __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp; + angles = Vector4( radiansXYZ, 0.0f ).get128(); + sincosf4( angles, &s, &c ); + negS = negatef4( s ); + Z0 = vec_mergel( c, s ); + Z1 = vec_mergel( negS, c ); + __declspec(align(16)) unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0}; + Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) ); + Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) ); + Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) ); + X0 = vec_splat( s, 0 ); + X1 = vec_splat( c, 0 ); + tmp = vec_mul( Z0, Y1 ); + return Matrix4( + Vector4( vec_mul( Z0, Y0 ) ), + Vector4( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ), + Vector4( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) ), + Vector4::wAxis( ) + ); +} + +__forceinline const Matrix4 Matrix4::rotation( float radians, const Vector3 &unitVec ) +{ + return rotation( floatInVec(radians), unitVec ); +} + +__forceinline const Matrix4 Matrix4::rotation( const floatInVec &radians, const Vector3 &unitVec ) +{ + __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2; + axis = unitVec.get128(); + sincosf4( radians.get128(), &s, &c ); + xxxx = vec_splat( axis, 0 ); + yyyy = vec_splat( axis, 1 ); + zzzz = vec_splat( axis, 2 ); + oneMinusC = vec_sub( _mm_set1_ps(1.0f), c ); + axisS = vec_mul( axis, s ); + negAxisS = negatef4( axisS ); + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + //tmp0 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_XZBX ); + tmp0 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,2,0) ); + tmp0 = vec_sel(tmp0, vec_splat(negAxisS, 1), select_z); + //tmp1 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_CXXX ); + tmp1 = vec_sel( vec_splat(axisS, 0), vec_splat(negAxisS, 2), select_x ); + //tmp2 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_YAXX ); + tmp2 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,0,1) ); + tmp2 = vec_sel(tmp2, vec_splat(negAxisS, 0), select_y); + tmp0 = vec_sel( tmp0, c, select_x ); + tmp1 = vec_sel( tmp1, c, select_y ); + tmp2 = vec_sel( tmp2, c, select_z ); + __declspec(align(16)) unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0}; + axis = vec_and( axis, _mm_load_ps( (float *)select_xyz ) ); + tmp0 = vec_and( tmp0, _mm_load_ps( (float *)select_xyz ) ); + tmp1 = vec_and( tmp1, _mm_load_ps( (float *)select_xyz ) ); + tmp2 = vec_and( tmp2, _mm_load_ps( (float *)select_xyz ) ); + return Matrix4( + Vector4( vec_madd( vec_mul( axis, xxxx ), oneMinusC, tmp0 ) ), + Vector4( vec_madd( vec_mul( axis, yyyy ), oneMinusC, tmp1 ) ), + Vector4( vec_madd( vec_mul( axis, zzzz ), oneMinusC, tmp2 ) ), + Vector4::wAxis( ) + ); +} + +__forceinline const Matrix4 Matrix4::rotation( const Quat &unitQuat ) +{ + return Matrix4( Transform3::rotation( unitQuat ) ); +} + +__forceinline const Matrix4 Matrix4::scale( const Vector3 &scaleVec ) +{ + __m128 zero = _mm_setzero_ps(); + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + return Matrix4( + Vector4( vec_sel( zero, scaleVec.get128(), select_x ) ), + Vector4( vec_sel( zero, scaleVec.get128(), select_y ) ), + Vector4( vec_sel( zero, scaleVec.get128(), select_z ) ), + Vector4::wAxis( ) + ); +} + +__forceinline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec ) +{ + return Matrix4( + ( mat.getCol0() * scaleVec.getX( ) ), + ( mat.getCol1() * scaleVec.getY( ) ), + ( mat.getCol2() * scaleVec.getZ( ) ), + mat.getCol3() + ); +} + +__forceinline const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat ) +{ + Vector4 scale4; + scale4 = Vector4( scaleVec, 1.0f ); + return Matrix4( + mulPerElem( mat.getCol0(), scale4 ), + mulPerElem( mat.getCol1(), scale4 ), + mulPerElem( mat.getCol2(), scale4 ), + mulPerElem( mat.getCol3(), scale4 ) + ); +} + +__forceinline const Matrix4 Matrix4::translation( const Vector3 &translateVec ) +{ + return Matrix4( + Vector4::xAxis( ), + Vector4::yAxis( ), + Vector4::zAxis( ), + Vector4( translateVec, 1.0f ) + ); +} + +__forceinline const Matrix4 Matrix4::lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec ) +{ + Matrix4 m4EyeFrame; + Vector3 v3X, v3Y, v3Z; + v3Y = normalize( upVec ); + v3Z = normalize( ( eyePos - lookAtPos ) ); + v3X = normalize( cross( v3Y, v3Z ) ); + v3Y = cross( v3Z, v3X ); + m4EyeFrame = Matrix4( Vector4( v3X ), Vector4( v3Y ), Vector4( v3Z ), Vector4( eyePos ) ); + return orthoInverse( m4EyeFrame ); +} + +__forceinline const Matrix4 Matrix4::perspective( float fovyRadians, float aspect, float zNear, float zFar ) +{ + float f, rangeInv; + __m128 zero, col0, col1, col2, col3; + union { __m128 v; float s[4]; } tmp; + f = tanf( _VECTORMATH_PI_OVER_2 - fovyRadians * 0.5f ); + rangeInv = 1.0f / ( zNear - zFar ); + zero = _mm_setzero_ps(); + tmp.v = zero; + tmp.s[0] = f / aspect; + col0 = tmp.v; + tmp.v = zero; + tmp.s[1] = f; + col1 = tmp.v; + tmp.v = zero; + tmp.s[2] = ( zNear + zFar ) * rangeInv; + tmp.s[3] = -1.0f; + col2 = tmp.v; + tmp.v = zero; + tmp.s[2] = zNear * zFar * rangeInv * 2.0f; + col3 = tmp.v; + return Matrix4( + Vector4( col0 ), + Vector4( col1 ), + Vector4( col2 ), + Vector4( col3 ) + ); +} + +__forceinline const Matrix4 Matrix4::frustum( float left, float right, float bottom, float top, float zNear, float zFar ) +{ + /* function implementation based on code from STIDC SDK: */ + /* -------------------------------------------------------------- */ + /* PLEASE DO NOT MODIFY THIS SECTION */ + /* This prolog section is automatically generated. */ + /* */ + /* (C)Copyright */ + /* Sony Computer Entertainment, Inc., */ + /* Toshiba Corporation, */ + /* International Business Machines Corporation, */ + /* 2001,2002. */ + /* S/T/I Confidential Information */ + /* -------------------------------------------------------------- */ + __m128 lbf, rtn; + __m128 diff, sum, inv_diff; + __m128 diagonal, column, near2; + __m128 zero = _mm_setzero_ps(); + union { __m128 v; float s[4]; } l, f, r, n, b, t; // TODO: Union? + l.s[0] = left; + f.s[0] = zFar; + r.s[0] = right; + n.s[0] = zNear; + b.s[0] = bottom; + t.s[0] = top; + lbf = vec_mergeh( l.v, f.v ); + rtn = vec_mergeh( r.v, n.v ); + lbf = vec_mergeh( lbf, b.v ); + rtn = vec_mergeh( rtn, t.v ); + diff = vec_sub( rtn, lbf ); + sum = vec_add( rtn, lbf ); + inv_diff = recipf4( diff ); + near2 = vec_splat( n.v, 0 ); + near2 = vec_add( near2, near2 ); + diagonal = vec_mul( near2, inv_diff ); + column = vec_mul( sum, inv_diff ); + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + __declspec(align(16)) unsigned int select_w[4] = {0, 0, 0, 0xffffffff}; + return Matrix4( + Vector4( vec_sel( zero, diagonal, select_x ) ), + Vector4( vec_sel( zero, diagonal, select_y ) ), + Vector4( vec_sel( column, _mm_set1_ps(-1.0f), select_w ) ), + Vector4( vec_sel( zero, vec_mul( diagonal, vec_splat( f.v, 0 ) ), select_z ) ) + ); +} + +__forceinline const Matrix4 Matrix4::orthographic( float left, float right, float bottom, float top, float zNear, float zFar ) +{ + /* function implementation based on code from STIDC SDK: */ + /* -------------------------------------------------------------- */ + /* PLEASE DO NOT MODIFY THIS SECTION */ + /* This prolog section is automatically generated. */ + /* */ + /* (C)Copyright */ + /* Sony Computer Entertainment, Inc., */ + /* Toshiba Corporation, */ + /* International Business Machines Corporation, */ + /* 2001,2002. */ + /* S/T/I Confidential Information */ + /* -------------------------------------------------------------- */ + __m128 lbf, rtn; + __m128 diff, sum, inv_diff, neg_inv_diff; + __m128 diagonal, column; + __m128 zero = _mm_setzero_ps(); + union { __m128 v; float s[4]; } l, f, r, n, b, t; + l.s[0] = left; + f.s[0] = zFar; + r.s[0] = right; + n.s[0] = zNear; + b.s[0] = bottom; + t.s[0] = top; + lbf = vec_mergeh( l.v, f.v ); + rtn = vec_mergeh( r.v, n.v ); + lbf = vec_mergeh( lbf, b.v ); + rtn = vec_mergeh( rtn, t.v ); + diff = vec_sub( rtn, lbf ); + sum = vec_add( rtn, lbf ); + inv_diff = recipf4( diff ); + neg_inv_diff = negatef4( inv_diff ); + diagonal = vec_add( inv_diff, inv_diff ); + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + __declspec(align(16)) unsigned int select_w[4] = {0, 0, 0, 0xffffffff}; + column = vec_mul( sum, vec_sel( neg_inv_diff, inv_diff, select_z ) ); // TODO: no madds with zero + return Matrix4( + Vector4( vec_sel( zero, diagonal, select_x ) ), + Vector4( vec_sel( zero, diagonal, select_y ) ), + Vector4( vec_sel( zero, diagonal, select_z ) ), + Vector4( vec_sel( column, _mm_set1_ps(1.0f), select_w ) ) + ); +} + +__forceinline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 ) +{ + return Matrix4( + select( mat0.getCol0(), mat1.getCol0(), select1 ), + select( mat0.getCol1(), mat1.getCol1(), select1 ), + select( mat0.getCol2(), mat1.getCol2(), select1 ), + select( mat0.getCol3(), mat1.getCol3(), select1 ) + ); +} + +__forceinline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 ) +{ + return Matrix4( + select( mat0.getCol0(), mat1.getCol0(), select1 ), + select( mat0.getCol1(), mat1.getCol1(), select1 ), + select( mat0.getCol2(), mat1.getCol2(), select1 ), + select( mat0.getCol3(), mat1.getCol3(), select1 ) + ); +} + +#ifdef _VECTORMATH_DEBUG + +__forceinline void print( const Matrix4 & mat ) +{ + print( mat.getRow( 0 ) ); + print( mat.getRow( 1 ) ); + print( mat.getRow( 2 ) ); + print( mat.getRow( 3 ) ); +} + +__forceinline void print( const Matrix4 & mat, const char * name ) +{ + printf("%s:\n", name); + print( mat ); +} + +#endif + +__forceinline Transform3::Transform3( const Transform3 & tfrm ) +{ + mCol0 = tfrm.mCol0; + mCol1 = tfrm.mCol1; + mCol2 = tfrm.mCol2; + mCol3 = tfrm.mCol3; +} + +__forceinline Transform3::Transform3( float scalar ) +{ + mCol0 = Vector3( scalar ); + mCol1 = Vector3( scalar ); + mCol2 = Vector3( scalar ); + mCol3 = Vector3( scalar ); +} + +__forceinline Transform3::Transform3( const floatInVec &scalar ) +{ + mCol0 = Vector3( scalar ); + mCol1 = Vector3( scalar ); + mCol2 = Vector3( scalar ); + mCol3 = Vector3( scalar ); +} + +__forceinline Transform3::Transform3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2, const Vector3 &_col3 ) +{ + mCol0 = _col0; + mCol1 = _col1; + mCol2 = _col2; + mCol3 = _col3; +} + +__forceinline Transform3::Transform3( const Matrix3 & tfrm, const Vector3 &translateVec ) +{ + this->setUpper3x3( tfrm ); + this->setTranslation( translateVec ); +} + +__forceinline Transform3::Transform3( const Quat &unitQuat, const Vector3 &translateVec ) +{ + this->setUpper3x3( Matrix3( unitQuat ) ); + this->setTranslation( translateVec ); +} + +__forceinline Transform3 & Transform3::setCol0( const Vector3 &_col0 ) +{ + mCol0 = _col0; + return *this; +} + +__forceinline Transform3 & Transform3::setCol1( const Vector3 &_col1 ) +{ + mCol1 = _col1; + return *this; +} + +__forceinline Transform3 & Transform3::setCol2( const Vector3 &_col2 ) +{ + mCol2 = _col2; + return *this; +} + +__forceinline Transform3 & Transform3::setCol3( const Vector3 &_col3 ) +{ + mCol3 = _col3; + return *this; +} + +__forceinline Transform3 & Transform3::setCol( int col, const Vector3 &vec ) +{ + *(&mCol0 + col) = vec; + return *this; +} + +__forceinline Transform3 & Transform3::setRow( int row, const Vector4 &vec ) +{ + mCol0.setElem( row, vec.getElem( 0 ) ); + mCol1.setElem( row, vec.getElem( 1 ) ); + mCol2.setElem( row, vec.getElem( 2 ) ); + mCol3.setElem( row, vec.getElem( 3 ) ); + return *this; +} + +__forceinline Transform3 & Transform3::setElem( int col, int row, float val ) +{ + (*this)[col].setElem(row, val); + return *this; +} + +__forceinline Transform3 & Transform3::setElem( int col, int row, const floatInVec &val ) +{ + Vector3 tmpV3_0; + tmpV3_0 = this->getCol( col ); + tmpV3_0.setElem( row, val ); + this->setCol( col, tmpV3_0 ); + return *this; +} + +__forceinline const floatInVec Transform3::getElem( int col, int row ) const +{ + return this->getCol( col ).getElem( row ); +} + +__forceinline const Vector3 Transform3::getCol0( ) const +{ + return mCol0; +} + +__forceinline const Vector3 Transform3::getCol1( ) const +{ + return mCol1; +} + +__forceinline const Vector3 Transform3::getCol2( ) const +{ + return mCol2; +} + +__forceinline const Vector3 Transform3::getCol3( ) const +{ + return mCol3; +} + +__forceinline const Vector3 Transform3::getCol( int col ) const +{ + return *(&mCol0 + col); +} + +__forceinline const Vector4 Transform3::getRow( int row ) const +{ + return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) ); +} + +__forceinline Vector3 & Transform3::operator []( int col ) +{ + return *(&mCol0 + col); +} + +__forceinline const Vector3 Transform3::operator []( int col ) const +{ + return *(&mCol0 + col); +} + +__forceinline Transform3 & Transform3::operator =( const Transform3 & tfrm ) +{ + mCol0 = tfrm.mCol0; + mCol1 = tfrm.mCol1; + mCol2 = tfrm.mCol2; + mCol3 = tfrm.mCol3; + return *this; +} + +__forceinline const Transform3 inverse( const Transform3 & tfrm ) +{ + __m128 inv0, inv1, inv2, inv3; + __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet; + __m128 xxxx, yyyy, zzzz; + tmp2 = _vmathVfCross( tfrm.getCol0().get128(), tfrm.getCol1().get128() ); + tmp0 = _vmathVfCross( tfrm.getCol1().get128(), tfrm.getCol2().get128() ); + tmp1 = _vmathVfCross( tfrm.getCol2().get128(), tfrm.getCol0().get128() ); + inv3 = negatef4( tfrm.getCol3().get128() ); + dot = _vmathVfDot3( tmp2, tfrm.getCol2().get128() ); + dot = vec_splat( dot, 0 ); + invdet = recipf4( dot ); + tmp3 = vec_mergeh( tmp0, tmp2 ); + tmp4 = vec_mergel( tmp0, tmp2 ); + inv0 = vec_mergeh( tmp3, tmp1 ); + xxxx = vec_splat( inv3, 0 ); + //inv1 = vec_perm( tmp3, tmp1, _VECTORMATH_PERM_ZBWX ); + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + inv1 = _mm_shuffle_ps( tmp3, tmp3, _MM_SHUFFLE(0,3,2,2)); + inv1 = vec_sel(inv1, tmp1, select_y); + //inv2 = vec_perm( tmp4, tmp1, _VECTORMATH_PERM_XCYX ); + inv2 = _mm_shuffle_ps( tmp4, tmp4, _MM_SHUFFLE(0,1,1,0)); + inv2 = vec_sel(inv2, vec_splat(tmp1, 2), select_y); + yyyy = vec_splat( inv3, 1 ); + zzzz = vec_splat( inv3, 2 ); + inv3 = vec_mul( inv0, xxxx ); + inv3 = vec_madd( inv1, yyyy, inv3 ); + inv3 = vec_madd( inv2, zzzz, inv3 ); + inv0 = vec_mul( inv0, invdet ); + inv1 = vec_mul( inv1, invdet ); + inv2 = vec_mul( inv2, invdet ); + inv3 = vec_mul( inv3, invdet ); + return Transform3( + Vector3( inv0 ), + Vector3( inv1 ), + Vector3( inv2 ), + Vector3( inv3 ) + ); +} + +__forceinline const Transform3 orthoInverse( const Transform3 & tfrm ) +{ + __m128 inv0, inv1, inv2, inv3; + __m128 tmp0, tmp1; + __m128 xxxx, yyyy, zzzz; + tmp0 = vec_mergeh( tfrm.getCol0().get128(), tfrm.getCol2().get128() ); + tmp1 = vec_mergel( tfrm.getCol0().get128(), tfrm.getCol2().get128() ); + inv3 = negatef4( tfrm.getCol3().get128() ); + inv0 = vec_mergeh( tmp0, tfrm.getCol1().get128() ); + xxxx = vec_splat( inv3, 0 ); + //inv1 = vec_perm( tmp0, tfrm.getCol1().get128(), _VECTORMATH_PERM_ZBWX ); + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + inv1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2)); + inv1 = vec_sel(inv1, tfrm.getCol1().get128(), select_y); + //inv2 = vec_perm( tmp1, tfrm.getCol1().get128(), _VECTORMATH_PERM_XCYX ); + inv2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0)); + inv2 = vec_sel(inv2, vec_splat(tfrm.getCol1().get128(), 2), select_y); + yyyy = vec_splat( inv3, 1 ); + zzzz = vec_splat( inv3, 2 ); + inv3 = vec_mul( inv0, xxxx ); + inv3 = vec_madd( inv1, yyyy, inv3 ); + inv3 = vec_madd( inv2, zzzz, inv3 ); + return Transform3( + Vector3( inv0 ), + Vector3( inv1 ), + Vector3( inv2 ), + Vector3( inv3 ) + ); +} + +__forceinline const Transform3 absPerElem( const Transform3 & tfrm ) +{ + return Transform3( + absPerElem( tfrm.getCol0() ), + absPerElem( tfrm.getCol1() ), + absPerElem( tfrm.getCol2() ), + absPerElem( tfrm.getCol3() ) + ); +} + +__forceinline const Vector3 Transform3::operator *( const Vector3 &vec ) const +{ + __m128 res; + __m128 xxxx, yyyy, zzzz; + xxxx = vec_splat( vec.get128(), 0 ); + yyyy = vec_splat( vec.get128(), 1 ); + zzzz = vec_splat( vec.get128(), 2 ); + res = vec_mul( mCol0.get128(), xxxx ); + res = vec_madd( mCol1.get128(), yyyy, res ); + res = vec_madd( mCol2.get128(), zzzz, res ); + return Vector3( res ); +} + +__forceinline const Point3 Transform3::operator *( const Point3 &pnt ) const +{ + __m128 tmp0, tmp1, res; + __m128 xxxx, yyyy, zzzz; + xxxx = vec_splat( pnt.get128(), 0 ); + yyyy = vec_splat( pnt.get128(), 1 ); + zzzz = vec_splat( pnt.get128(), 2 ); + tmp0 = vec_mul( mCol0.get128(), xxxx ); + tmp1 = vec_mul( mCol1.get128(), yyyy ); + tmp0 = vec_madd( mCol2.get128(), zzzz, tmp0 ); + tmp1 = vec_add( mCol3.get128(), tmp1 ); + res = vec_add( tmp0, tmp1 ); + return Point3( res ); +} + +__forceinline const Transform3 Transform3::operator *( const Transform3 & tfrm ) const +{ + return Transform3( + ( *this * tfrm.mCol0 ), + ( *this * tfrm.mCol1 ), + ( *this * tfrm.mCol2 ), + Vector3( ( *this * Point3( tfrm.mCol3 ) ) ) + ); +} + +__forceinline Transform3 & Transform3::operator *=( const Transform3 & tfrm ) +{ + *this = *this * tfrm; + return *this; +} + +__forceinline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 ) +{ + return Transform3( + mulPerElem( tfrm0.getCol0(), tfrm1.getCol0() ), + mulPerElem( tfrm0.getCol1(), tfrm1.getCol1() ), + mulPerElem( tfrm0.getCol2(), tfrm1.getCol2() ), + mulPerElem( tfrm0.getCol3(), tfrm1.getCol3() ) + ); +} + +__forceinline const Transform3 Transform3::identity( ) +{ + return Transform3( + Vector3::xAxis( ), + Vector3::yAxis( ), + Vector3::zAxis( ), + Vector3( 0.0f ) + ); +} + +__forceinline Transform3 & Transform3::setUpper3x3( const Matrix3 & tfrm ) +{ + mCol0 = tfrm.getCol0(); + mCol1 = tfrm.getCol1(); + mCol2 = tfrm.getCol2(); + return *this; +} + +__forceinline const Matrix3 Transform3::getUpper3x3( ) const +{ + return Matrix3( mCol0, mCol1, mCol2 ); +} + +__forceinline Transform3 & Transform3::setTranslation( const Vector3 &translateVec ) +{ + mCol3 = translateVec; + return *this; +} + +__forceinline const Vector3 Transform3::getTranslation( ) const +{ + return mCol3; +} + +__forceinline const Transform3 Transform3::rotationX( float radians ) +{ + return rotationX( floatInVec(radians) ); +} + +__forceinline const Transform3 Transform3::rotationX( const floatInVec &radians ) +{ + __m128 s, c, res1, res2; + __m128 zero; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + zero = _mm_setzero_ps(); + sincosf4( radians.get128(), &s, &c ); + res1 = vec_sel( zero, c, select_y ); + res1 = vec_sel( res1, s, select_z ); + res2 = vec_sel( zero, negatef4(s), select_y ); + res2 = vec_sel( res2, c, select_z ); + return Transform3( + Vector3::xAxis( ), + Vector3( res1 ), + Vector3( res2 ), + Vector3( _mm_setzero_ps() ) + ); +} + +__forceinline const Transform3 Transform3::rotationY( float radians ) +{ + return rotationY( floatInVec(radians) ); +} + +__forceinline const Transform3 Transform3::rotationY( const floatInVec &radians ) +{ + __m128 s, c, res0, res2; + __m128 zero; + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + zero = _mm_setzero_ps(); + sincosf4( radians.get128(), &s, &c ); + res0 = vec_sel( zero, c, select_x ); + res0 = vec_sel( res0, negatef4(s), select_z ); + res2 = vec_sel( zero, s, select_x ); + res2 = vec_sel( res2, c, select_z ); + return Transform3( + Vector3( res0 ), + Vector3::yAxis( ), + Vector3( res2 ), + Vector3( 0.0f ) + ); +} + +__forceinline const Transform3 Transform3::rotationZ( float radians ) +{ + return rotationZ( floatInVec(radians) ); +} + +__forceinline const Transform3 Transform3::rotationZ( const floatInVec &radians ) +{ + __m128 s, c, res0, res1; + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + __m128 zero = _mm_setzero_ps(); + sincosf4( radians.get128(), &s, &c ); + res0 = vec_sel( zero, c, select_x ); + res0 = vec_sel( res0, s, select_y ); + res1 = vec_sel( zero, negatef4(s), select_x ); + res1 = vec_sel( res1, c, select_y ); + return Transform3( + Vector3( res0 ), + Vector3( res1 ), + Vector3::zAxis( ), + Vector3( 0.0f ) + ); +} + +__forceinline const Transform3 Transform3::rotationZYX( const Vector3 &radiansXYZ ) +{ + __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp; + angles = Vector4( radiansXYZ, 0.0f ).get128(); + sincosf4( angles, &s, &c ); + negS = negatef4( s ); + Z0 = vec_mergel( c, s ); + Z1 = vec_mergel( negS, c ); + __declspec(align(16)) unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0}; + Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) ); + Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) ); + Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) ); + X0 = vec_splat( s, 0 ); + X1 = vec_splat( c, 0 ); + tmp = vec_mul( Z0, Y1 ); + return Transform3( + Vector3( vec_mul( Z0, Y0 ) ), + Vector3( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ), + Vector3( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) ), + Vector3( 0.0f ) + ); +} + +__forceinline const Transform3 Transform3::rotation( float radians, const Vector3 &unitVec ) +{ + return rotation( floatInVec(radians), unitVec ); +} + +__forceinline const Transform3 Transform3::rotation( const floatInVec &radians, const Vector3 &unitVec ) +{ + return Transform3( Matrix3::rotation( radians, unitVec ), Vector3( 0.0f ) ); +} + +__forceinline const Transform3 Transform3::rotation( const Quat &unitQuat ) +{ + return Transform3( Matrix3( unitQuat ), Vector3( 0.0f ) ); +} + +__forceinline const Transform3 Transform3::scale( const Vector3 &scaleVec ) +{ + __m128 zero = _mm_setzero_ps(); + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + return Transform3( + Vector3( vec_sel( zero, scaleVec.get128(), select_x ) ), + Vector3( vec_sel( zero, scaleVec.get128(), select_y ) ), + Vector3( vec_sel( zero, scaleVec.get128(), select_z ) ), + Vector3( 0.0f ) + ); +} + +__forceinline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec ) +{ + return Transform3( + ( tfrm.getCol0() * scaleVec.getX( ) ), + ( tfrm.getCol1() * scaleVec.getY( ) ), + ( tfrm.getCol2() * scaleVec.getZ( ) ), + tfrm.getCol3() + ); +} + +__forceinline const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm ) +{ + return Transform3( + mulPerElem( tfrm.getCol0(), scaleVec ), + mulPerElem( tfrm.getCol1(), scaleVec ), + mulPerElem( tfrm.getCol2(), scaleVec ), + mulPerElem( tfrm.getCol3(), scaleVec ) + ); +} + +__forceinline const Transform3 Transform3::translation( const Vector3 &translateVec ) +{ + return Transform3( + Vector3::xAxis( ), + Vector3::yAxis( ), + Vector3::zAxis( ), + translateVec + ); +} + +__forceinline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 ) +{ + return Transform3( + select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ), + select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ), + select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ), + select( tfrm0.getCol3(), tfrm1.getCol3(), select1 ) + ); +} + +__forceinline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 ) +{ + return Transform3( + select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ), + select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ), + select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ), + select( tfrm0.getCol3(), tfrm1.getCol3(), select1 ) + ); +} + +#ifdef _VECTORMATH_DEBUG + +__forceinline void print( const Transform3 & tfrm ) +{ + print( tfrm.getRow( 0 ) ); + print( tfrm.getRow( 1 ) ); + print( tfrm.getRow( 2 ) ); +} + +__forceinline void print( const Transform3 & tfrm, const char * name ) +{ + printf("%s:\n", name); + print( tfrm ); +} + +#endif + +__forceinline Quat::Quat( const Matrix3 & tfrm ) +{ + __m128 res; + __m128 col0, col1, col2; + __m128 xx_yy, xx_yy_zz_xx, yy_zz_xx_yy, zz_xx_yy_zz, diagSum, diagDiff; + __m128 zy_xz_yx, yz_zx_xy, sum, diff; + __m128 radicand, invSqrt, scale; + __m128 res0, res1, res2, res3; + __m128 xx, yy, zz; + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + __declspec(align(16)) unsigned int select_w[4] = {0, 0, 0, 0xffffffff}; + + col0 = tfrm.getCol0().get128(); + col1 = tfrm.getCol1().get128(); + col2 = tfrm.getCol2().get128(); + + /* four cases: */ + /* trace > 0 */ + /* else */ + /* xx largest diagonal element */ + /* yy largest diagonal element */ + /* zz largest diagonal element */ + + /* compute quaternion for each case */ + + xx_yy = vec_sel( col0, col1, select_y ); + //xx_yy_zz_xx = vec_perm( xx_yy, col2, _VECTORMATH_PERM_XYCX ); + //yy_zz_xx_yy = vec_perm( xx_yy, col2, _VECTORMATH_PERM_YCXY ); + //zz_xx_yy_zz = vec_perm( xx_yy, col2, _VECTORMATH_PERM_CXYC ); + xx_yy_zz_xx = _mm_shuffle_ps( xx_yy, xx_yy, _MM_SHUFFLE(0,0,1,0) ); + xx_yy_zz_xx = vec_sel( xx_yy_zz_xx, col2, select_z ); // TODO: Ck + yy_zz_xx_yy = _mm_shuffle_ps( xx_yy_zz_xx, xx_yy_zz_xx, _MM_SHUFFLE(1,0,2,1) ); + zz_xx_yy_zz = _mm_shuffle_ps( xx_yy_zz_xx, xx_yy_zz_xx, _MM_SHUFFLE(2,1,0,2) ); + + diagSum = vec_add( vec_add( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz ); + diagDiff = vec_sub( vec_sub( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz ); + radicand = vec_add( vec_sel( diagDiff, diagSum, select_w ), _mm_set1_ps(1.0f) ); + // invSqrt = rsqrtf4( radicand ); + invSqrt = newtonrapson_rsqrt4( radicand ); + + + + zy_xz_yx = vec_sel( col0, col1, select_z ); // zy_xz_yx = 00 01 12 03 + //zy_xz_yx = vec_perm( zy_xz_yx, col2, _VECTORMATH_PERM_ZAYX ); + zy_xz_yx = _mm_shuffle_ps( zy_xz_yx, zy_xz_yx, _MM_SHUFFLE(0,1,2,2) ); // zy_xz_yx = 12 12 01 00 + zy_xz_yx = vec_sel( zy_xz_yx, vec_splat(col2, 0), select_y ); // zy_xz_yx = 12 20 01 00 + yz_zx_xy = vec_sel( col0, col1, select_x ); // yz_zx_xy = 10 01 02 03 + //yz_zx_xy = vec_perm( yz_zx_xy, col2, _VECTORMATH_PERM_BZXX ); + yz_zx_xy = _mm_shuffle_ps( yz_zx_xy, yz_zx_xy, _MM_SHUFFLE(0,0,2,0) ); // yz_zx_xy = 10 02 10 10 + yz_zx_xy = vec_sel( yz_zx_xy, vec_splat(col2, 1), select_x ); // yz_zx_xy = 21 02 10 10 + + sum = vec_add( zy_xz_yx, yz_zx_xy ); + diff = vec_sub( zy_xz_yx, yz_zx_xy ); + + scale = vec_mul( invSqrt, _mm_set1_ps(0.5f) ); + + //res0 = vec_perm( sum, diff, _VECTORMATH_PERM_XZYA ); + res0 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,1,2,0) ); + res0 = vec_sel( res0, vec_splat(diff, 0), select_w ); // TODO: Ck + //res1 = vec_perm( sum, diff, _VECTORMATH_PERM_ZXXB ); + res1 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,0,0,2) ); + res1 = vec_sel( res1, vec_splat(diff, 1), select_w ); // TODO: Ck + //res2 = vec_perm( sum, diff, _VECTORMATH_PERM_YXXC ); + res2 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,0,0,1) ); + res2 = vec_sel( res2, vec_splat(diff, 2), select_w ); // TODO: Ck + res3 = diff; + res0 = vec_sel( res0, radicand, select_x ); + res1 = vec_sel( res1, radicand, select_y ); + res2 = vec_sel( res2, radicand, select_z ); + res3 = vec_sel( res3, radicand, select_w ); + res0 = vec_mul( res0, vec_splat( scale, 0 ) ); + res1 = vec_mul( res1, vec_splat( scale, 1 ) ); + res2 = vec_mul( res2, vec_splat( scale, 2 ) ); + res3 = vec_mul( res3, vec_splat( scale, 3 ) ); + + /* determine case and select answer */ + + xx = vec_splat( col0, 0 ); + yy = vec_splat( col1, 1 ); + zz = vec_splat( col2, 2 ); + res = vec_sel( res0, res1, vec_cmpgt( yy, xx ) ); + res = vec_sel( res, res2, vec_and( vec_cmpgt( zz, xx ), vec_cmpgt( zz, yy ) ) ); + res = vec_sel( res, res3, vec_cmpgt( vec_splat( diagSum, 0 ), _mm_setzero_ps() ) ); + mVec128 = res; +} + +__forceinline const Matrix3 outer( const Vector3 &tfrm0, const Vector3 &tfrm1 ) +{ + return Matrix3( + ( tfrm0 * tfrm1.getX( ) ), + ( tfrm0 * tfrm1.getY( ) ), + ( tfrm0 * tfrm1.getZ( ) ) + ); +} + +__forceinline const Matrix4 outer( const Vector4 &tfrm0, const Vector4 &tfrm1 ) +{ + return Matrix4( + ( tfrm0 * tfrm1.getX( ) ), + ( tfrm0 * tfrm1.getY( ) ), + ( tfrm0 * tfrm1.getZ( ) ), + ( tfrm0 * tfrm1.getW( ) ) + ); +} + +__forceinline const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat ) +{ + __m128 tmp0, tmp1, mcol0, mcol1, mcol2, res; + __m128 xxxx, yyyy, zzzz; + tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() ); + tmp1 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() ); + xxxx = vec_splat( vec.get128(), 0 ); + mcol0 = vec_mergeh( tmp0, mat.getCol1().get128() ); + //mcol1 = vec_perm( tmp0, mat.getCol1().get128(), _VECTORMATH_PERM_ZBWX ); + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + mcol1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2)); + mcol1 = vec_sel(mcol1, mat.getCol1().get128(), select_y); + //mcol2 = vec_perm( tmp1, mat.getCol1().get128(), _VECTORMATH_PERM_XCYX ); + mcol2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0)); + mcol2 = vec_sel(mcol2, vec_splat(mat.getCol1().get128(), 2), select_y); + yyyy = vec_splat( vec.get128(), 1 ); + res = vec_mul( mcol0, xxxx ); + zzzz = vec_splat( vec.get128(), 2 ); + res = vec_madd( mcol1, yyyy, res ); + res = vec_madd( mcol2, zzzz, res ); + return Vector3( res ); +} + +__forceinline const Matrix3 crossMatrix( const Vector3 &vec ) +{ + __m128 neg, res0, res1, res2; + neg = negatef4( vec.get128() ); + __declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0}; + __declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0}; + //res0 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_XZBX ); + res0 = _mm_shuffle_ps( vec.get128(), vec.get128(), _MM_SHUFFLE(0,2,2,0) ); + res0 = vec_sel(res0, vec_splat(neg, 1), select_z); + //res1 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_CXXX ); + res1 = vec_sel(vec_splat(vec.get128(), 0), vec_splat(neg, 2), select_x); + //res2 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_YAXX ); + res2 = _mm_shuffle_ps( vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,1,1) ); + res2 = vec_sel(res2, vec_splat(neg, 0), select_y); + __declspec(align(16)) unsigned int filter_x[4] = {0, 0xffffffff, 0xffffffff, 0xffffffff}; + __declspec(align(16)) unsigned int filter_y[4] = {0xffffffff, 0, 0xffffffff, 0xffffffff}; + __declspec(align(16)) unsigned int filter_z[4] = {0xffffffff, 0xffffffff, 0, 0xffffffff}; + res0 = vec_and( res0, _mm_load_ps((float *)filter_x ) ); + res1 = vec_and( res1, _mm_load_ps((float *)filter_y ) ); + res2 = vec_and( res2, _mm_load_ps((float *)filter_z ) ); // TODO: Use selects? + return Matrix3( + Vector3( res0 ), + Vector3( res1 ), + Vector3( res2 ) + ); +} + +__forceinline const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat ) +{ + return Matrix3( cross( vec, mat.getCol0() ), cross( vec, mat.getCol1() ), cross( vec, mat.getCol2() ) ); +} + +} // namespace Aos +} // namespace Vectormath + +#endif diff --git a/src/BulletMultiThreaded/vectormath/sse/quat_aos.h b/src/BulletMultiThreaded/vectormath/sse/quat_aos.h new file mode 100644 index 000000000..3d4822142 --- /dev/null +++ b/src/BulletMultiThreaded/vectormath/sse/quat_aos.h @@ -0,0 +1,579 @@ +/* + Copyright (C) 2006, 2007 Sony Computer Entertainment Inc. + All rights reserved. + + Redistribution and use in source and binary forms, + with or without modification, are permitted provided that the + following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the Sony Computer Entertainment Inc nor the names + of its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + + +#ifndef _VECTORMATH_QUAT_AOS_CPP_H +#define _VECTORMATH_QUAT_AOS_CPP_H + +//----------------------------------------------------------------------------- +// Definitions + +#ifndef _VECTORMATH_INTERNAL_FUNCTIONS +#define _VECTORMATH_INTERNAL_FUNCTIONS + +#endif + +namespace Vectormath { +namespace Aos { + +__forceinline void Quat::set128(vec_float4 vec) +{ + mVec128 = vec; +} + +__forceinline Quat::Quat( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w ) +{ + mVec128 = _mm_unpacklo_ps( + _mm_unpacklo_ps( _x.get128(), _z.get128() ), + _mm_unpacklo_ps( _y.get128(), _w.get128() ) ); +} + +__forceinline Quat::Quat( const Vector3 &xyz, float _w ) +{ + mVec128 = xyz.get128(); + _vmathVfSetElement(mVec128, _w, 3); +} + + + +__forceinline Quat::Quat(const Quat& quat) +{ + mVec128 = quat.get128(); +} + +__forceinline Quat::Quat( float _x, float _y, float _z, float _w ) +{ + mVec128 = _mm_setr_ps(_x, _y, _z, _w); +} + + + + + +__forceinline Quat::Quat( const Vector3 &xyz, const floatInVec &_w ) +{ + mVec128 = xyz.get128(); + mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3); +} + +__forceinline Quat::Quat( const Vector4 &vec ) +{ + mVec128 = vec.get128(); +} + +__forceinline Quat::Quat( float scalar ) +{ + mVec128 = floatInVec(scalar).get128(); +} + +__forceinline Quat::Quat( const floatInVec &scalar ) +{ + mVec128 = scalar.get128(); +} + +__forceinline Quat::Quat( __m128 vf4 ) +{ + mVec128 = vf4; +} + +__forceinline const Quat Quat::identity( ) +{ + return Quat( _VECTORMATH_UNIT_0001 ); +} + +__forceinline const Quat lerp( float t, const Quat &quat0, const Quat &quat1 ) +{ + return lerp( floatInVec(t), quat0, quat1 ); +} + +__forceinline const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 ) +{ + return ( quat0 + ( ( quat1 - quat0 ) * t ) ); +} + +__forceinline const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 ) +{ + return slerp( floatInVec(t), unitQuat0, unitQuat1 ); +} + +__forceinline const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 ) +{ + Quat start; + vec_float4 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines; + __m128 selectMask; + cosAngle = _vmathVfDot4( unitQuat0.get128(), unitQuat1.get128() ); + selectMask = (__m128)vec_cmpgt( _mm_setzero_ps(), cosAngle ); + cosAngle = vec_sel( cosAngle, negatef4( cosAngle ), selectMask ); + start = Quat( vec_sel( unitQuat0.get128(), negatef4( unitQuat0.get128() ), selectMask ) ); + selectMask = (__m128)vec_cmpgt( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle ); + angle = acosf4( cosAngle ); + tttt = t.get128(); + oneMinusT = vec_sub( _mm_set1_ps(1.0f), tttt ); + angles = vec_mergeh( _mm_set1_ps(1.0f), tttt ); + angles = vec_mergeh( angles, oneMinusT ); + angles = vec_madd( angles, angle, _mm_setzero_ps() ); + sines = sinf4( angles ); + scales = _mm_div_ps( sines, vec_splat( sines, 0 ) ); + scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask ); + scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask ); + return Quat( vec_madd( start.get128(), scale0, vec_mul( unitQuat1.get128(), scale1 ) ) ); +} + +__forceinline const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 ) +{ + return squad( floatInVec(t), unitQuat0, unitQuat1, unitQuat2, unitQuat3 ); +} + +__forceinline const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 ) +{ + return slerp( ( ( floatInVec(2.0f) * t ) * ( floatInVec(1.0f) - t ) ), slerp( t, unitQuat0, unitQuat3 ), slerp( t, unitQuat1, unitQuat2 ) ); +} + +__forceinline __m128 Quat::get128( ) const +{ + return mVec128; +} + +__forceinline Quat & Quat::operator =( const Quat &quat ) +{ + mVec128 = quat.mVec128; + return *this; +} + +__forceinline Quat & Quat::setXYZ( const Vector3 &vec ) +{ + __declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff}; + mVec128 = vec_sel( vec.get128(), mVec128, sw ); + return *this; +} + +__forceinline const Vector3 Quat::getXYZ( ) const +{ + return Vector3( mVec128 ); +} + +__forceinline Quat & Quat::setX( float _x ) +{ + _vmathVfSetElement(mVec128, _x, 0); + return *this; +} + +__forceinline Quat & Quat::setX( const floatInVec &_x ) +{ + mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0); + return *this; +} + +__forceinline const floatInVec Quat::getX( ) const +{ + return floatInVec( mVec128, 0 ); +} + +__forceinline Quat & Quat::setY( float _y ) +{ + _vmathVfSetElement(mVec128, _y, 1); + return *this; +} + +__forceinline Quat & Quat::setY( const floatInVec &_y ) +{ + mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1); + return *this; +} + +__forceinline const floatInVec Quat::getY( ) const +{ + return floatInVec( mVec128, 1 ); +} + +__forceinline Quat & Quat::setZ( float _z ) +{ + _vmathVfSetElement(mVec128, _z, 2); + return *this; +} + +__forceinline Quat & Quat::setZ( const floatInVec &_z ) +{ + mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2); + return *this; +} + +__forceinline const floatInVec Quat::getZ( ) const +{ + return floatInVec( mVec128, 2 ); +} + +__forceinline Quat & Quat::setW( float _w ) +{ + _vmathVfSetElement(mVec128, _w, 3); + return *this; +} + +__forceinline Quat & Quat::setW( const floatInVec &_w ) +{ + mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3); + return *this; +} + +__forceinline const floatInVec Quat::getW( ) const +{ + return floatInVec( mVec128, 3 ); +} + +__forceinline Quat & Quat::setElem( int idx, float value ) +{ + _vmathVfSetElement(mVec128, value, idx); + return *this; +} + +__forceinline Quat & Quat::setElem( int idx, const floatInVec &value ) +{ + mVec128 = _vmathVfInsert(mVec128, value.get128(), idx); + return *this; +} + +__forceinline const floatInVec Quat::getElem( int idx ) const +{ + return floatInVec( mVec128, idx ); +} + +__forceinline VecIdx Quat::operator []( int idx ) +{ + return VecIdx( mVec128, idx ); +} + +__forceinline const floatInVec Quat::operator []( int idx ) const +{ + return floatInVec( mVec128, idx ); +} + +__forceinline const Quat Quat::operator +( const Quat &quat ) const +{ + return Quat( _mm_add_ps( mVec128, quat.mVec128 ) ); +} + + +__forceinline const Quat Quat::operator -( const Quat &quat ) const +{ + return Quat( _mm_sub_ps( mVec128, quat.mVec128 ) ); +} + +__forceinline const Quat Quat::operator *( float scalar ) const +{ + return *this * floatInVec(scalar); +} + +__forceinline const Quat Quat::operator *( const floatInVec &scalar ) const +{ + return Quat( _mm_mul_ps( mVec128, scalar.get128() ) ); +} + +__forceinline Quat & Quat::operator +=( const Quat &quat ) +{ + *this = *this + quat; + return *this; +} + +__forceinline Quat & Quat::operator -=( const Quat &quat ) +{ + *this = *this - quat; + return *this; +} + +__forceinline Quat & Quat::operator *=( float scalar ) +{ + *this = *this * scalar; + return *this; +} + +__forceinline Quat & Quat::operator *=( const floatInVec &scalar ) +{ + *this = *this * scalar; + return *this; +} + +__forceinline const Quat Quat::operator /( float scalar ) const +{ + return *this / floatInVec(scalar); +} + +__forceinline const Quat Quat::operator /( const floatInVec &scalar ) const +{ + return Quat( _mm_div_ps( mVec128, scalar.get128() ) ); +} + +__forceinline Quat & Quat::operator /=( float scalar ) +{ + *this = *this / scalar; + return *this; +} + +__forceinline Quat & Quat::operator /=( const floatInVec &scalar ) +{ + *this = *this / scalar; + return *this; +} + +__forceinline const Quat Quat::operator -( ) const +{ + return Quat(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) ); +} + +__forceinline const Quat operator *( float scalar, const Quat &quat ) +{ + return floatInVec(scalar) * quat; +} + +__forceinline const Quat operator *( const floatInVec &scalar, const Quat &quat ) +{ + return quat * scalar; +} + +__forceinline const floatInVec dot( const Quat &quat0, const Quat &quat1 ) +{ + return floatInVec( _vmathVfDot4( quat0.get128(), quat1.get128() ), 0 ); +} + +__forceinline const floatInVec norm( const Quat &quat ) +{ + return floatInVec( _vmathVfDot4( quat.get128(), quat.get128() ), 0 ); +} + +__forceinline const floatInVec length( const Quat &quat ) +{ + return floatInVec( _mm_sqrt_ps(_vmathVfDot4( quat.get128(), quat.get128() )), 0 ); +} + +__forceinline const Quat normalize( const Quat &quat ) +{ + vec_float4 dot =_vmathVfDot4( quat.get128(), quat.get128()); + return Quat( _mm_mul_ps( quat.get128(), newtonrapson_rsqrt4( dot ) ) ); +} + + +__forceinline const Quat Quat::rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 ) +{ + Vector3 crossVec; + __m128 cosAngle, cosAngleX2Plus2, recipCosHalfAngleX2, cosHalfAngleX2, res; + cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() ); + cosAngleX2Plus2 = vec_madd( cosAngle, _mm_set1_ps(2.0f), _mm_set1_ps(2.0f) ); + recipCosHalfAngleX2 = _mm_rsqrt_ps( cosAngleX2Plus2 ); + cosHalfAngleX2 = vec_mul( recipCosHalfAngleX2, cosAngleX2Plus2 ); + crossVec = cross( unitVec0, unitVec1 ); + res = vec_mul( crossVec.get128(), recipCosHalfAngleX2 ); + __declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff}; + res = vec_sel( res, vec_mul( cosHalfAngleX2, _mm_set1_ps(0.5f) ), sw ); + return Quat( res ); +} + +__forceinline const Quat Quat::rotation( float radians, const Vector3 &unitVec ) +{ + return rotation( floatInVec(radians), unitVec ); +} + +__forceinline const Quat Quat::rotation( const floatInVec &radians, const Vector3 &unitVec ) +{ + __m128 s, c, angle, res; + angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) ); + sincosf4( angle, &s, &c ); + __declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff}; + res = vec_sel( vec_mul( unitVec.get128(), s ), c, sw ); + return Quat( res ); +} + +__forceinline const Quat Quat::rotationX( float radians ) +{ + return rotationX( floatInVec(radians) ); +} + +__forceinline const Quat Quat::rotationX( const floatInVec &radians ) +{ + __m128 s, c, angle, res; + angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) ); + sincosf4( angle, &s, &c ); + __declspec(align(16)) unsigned int xsw[4] = {0xffffffff, 0, 0, 0}; + __declspec(align(16)) unsigned int wsw[4] = {0, 0, 0, 0xffffffff}; + res = vec_sel( _mm_setzero_ps(), s, xsw ); + res = vec_sel( res, c, wsw ); + return Quat( res ); +} + +__forceinline const Quat Quat::rotationY( float radians ) +{ + return rotationY( floatInVec(radians) ); +} + +__forceinline const Quat Quat::rotationY( const floatInVec &radians ) +{ + __m128 s, c, angle, res; + angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) ); + sincosf4( angle, &s, &c ); + __declspec(align(16)) unsigned int ysw[4] = {0, 0xffffffff, 0, 0}; + __declspec(align(16)) unsigned int wsw[4] = {0, 0, 0, 0xffffffff}; + res = vec_sel( _mm_setzero_ps(), s, ysw ); + res = vec_sel( res, c, wsw ); + return Quat( res ); +} + +__forceinline const Quat Quat::rotationZ( float radians ) +{ + return rotationZ( floatInVec(radians) ); +} + +__forceinline const Quat Quat::rotationZ( const floatInVec &radians ) +{ + __m128 s, c, angle, res; + angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) ); + sincosf4( angle, &s, &c ); + __declspec(align(16)) unsigned int zsw[4] = {0, 0, 0xffffffff, 0}; + __declspec(align(16)) unsigned int wsw[4] = {0, 0, 0, 0xffffffff}; + res = vec_sel( _mm_setzero_ps(), s, zsw ); + res = vec_sel( res, c, wsw ); + return Quat( res ); +} + +__forceinline const Quat Quat::operator *( const Quat &quat ) const +{ + __m128 ldata, rdata, qv, tmp0, tmp1, tmp2, tmp3; + __m128 product, l_wxyz, r_wxyz, xy, qw; + ldata = mVec128; + rdata = quat.mVec128; + tmp0 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,0,2,1) ); + tmp1 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,1,0,2) ); + tmp2 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,1,0,2) ); + tmp3 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,0,2,1) ); + qv = vec_mul( vec_splat( ldata, 3 ), rdata ); + qv = vec_madd( vec_splat( rdata, 3 ), ldata, qv ); + qv = vec_madd( tmp0, tmp1, qv ); + qv = vec_nmsub( tmp2, tmp3, qv ); + product = vec_mul( ldata, rdata ); + l_wxyz = vec_sld( ldata, ldata, 12 ); + r_wxyz = vec_sld( rdata, rdata, 12 ); + qw = vec_nmsub( l_wxyz, r_wxyz, product ); + xy = vec_madd( l_wxyz, r_wxyz, product ); + qw = vec_sub( qw, vec_sld( xy, xy, 8 ) ); + __declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff}; + return Quat( vec_sel( qv, qw, sw ) ); +} + +__forceinline Quat & Quat::operator *=( const Quat &quat ) +{ + *this = *this * quat; + return *this; +} + +__forceinline const Vector3 rotate( const Quat &quat, const Vector3 &vec ) +{ __m128 qdata, vdata, product, tmp0, tmp1, tmp2, tmp3, wwww, qv, qw, res; + qdata = quat.get128(); + vdata = vec.get128(); + tmp0 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,0,2,1) ); + tmp1 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,1,0,2) ); + tmp2 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,1,0,2) ); + tmp3 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,0,2,1) ); + wwww = vec_splat( qdata, 3 ); + qv = vec_mul( wwww, vdata ); + qv = vec_madd( tmp0, tmp1, qv ); + qv = vec_nmsub( tmp2, tmp3, qv ); + product = vec_mul( qdata, vdata ); + qw = vec_madd( vec_sld( qdata, qdata, 4 ), vec_sld( vdata, vdata, 4 ), product ); + qw = vec_add( vec_sld( product, product, 8 ), qw ); + tmp1 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,1,0,2) ); + tmp3 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,0,2,1) ); + res = vec_mul( vec_splat( qw, 0 ), qdata ); + res = vec_madd( wwww, qv, res ); + res = vec_madd( tmp0, tmp1, res ); + res = vec_nmsub( tmp2, tmp3, res ); + return Vector3( res ); +} + +__forceinline const Quat conj( const Quat &quat ) +{ + __declspec(align(16)) unsigned int sw[4] = {0x80000000,0x80000000,0x80000000,0}; + return Quat( vec_xor( quat.get128(), _mm_load_ps((float *)sw) ) ); +} + +__forceinline const Quat select( const Quat &quat0, const Quat &quat1, bool select1 ) +{ + return select( quat0, quat1, boolInVec(select1) ); +} + +//__forceinline const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 ) +//{ +// return Quat( vec_sel( quat0.get128(), quat1.get128(), select1.get128() ) ); +//} + +__forceinline void loadXYZW(Quat& quat, const float* fptr) +{ +#ifdef USE_SSE2_LDDQU + quat = Quat( SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128 ); +#else + SSEFloat fl; + fl.f[0] = fptr[0]; + fl.f[1] = fptr[1]; + fl.f[2] = fptr[2]; + fl.f[3] = fptr[3]; + quat = Quat( fl.m128); +#endif + + +} + +__forceinline void storeXYZW(const Quat& quat, float* fptr) +{ + fptr[0] = quat.getX(); + fptr[1] = quat.getY(); + fptr[2] = quat.getZ(); + fptr[3] = quat.getW(); +// _mm_storeu_ps((float*)quat.get128(),fptr); +} + + + +#ifdef _VECTORMATH_DEBUG + +__forceinline void print( const Quat &quat ) +{ + union { __m128 v; float s[4]; } tmp; + tmp.v = quat.get128(); + printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] ); +} + +__forceinline void print( const Quat &quat, const char * name ) +{ + union { __m128 v; float s[4]; } tmp; + tmp.v = quat.get128(); + printf( "%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] ); +} + +#endif + +} // namespace Aos +} // namespace Vectormath + +#endif diff --git a/src/BulletMultiThreaded/vectormath/sse/vec_aos.h b/src/BulletMultiThreaded/vectormath/sse/vec_aos.h new file mode 100644 index 000000000..1a6e02eae --- /dev/null +++ b/src/BulletMultiThreaded/vectormath/sse/vec_aos.h @@ -0,0 +1,1431 @@ +/* + Copyright (C) 2006, 2007 Sony Computer Entertainment Inc. + All rights reserved. + + Redistribution and use in source and binary forms, + with or without modification, are permitted provided that the + following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the Sony Computer Entertainment Inc nor the names + of its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _VECTORMATH_VEC_AOS_CPP_H +#define _VECTORMATH_VEC_AOS_CPP_H + +//----------------------------------------------------------------------------- +// Constants +// for permutes words are labeled [x,y,z,w] [a,b,c,d] + +#define _VECTORMATH_PERM_X 0x00010203 +#define _VECTORMATH_PERM_Y 0x04050607 +#define _VECTORMATH_PERM_Z 0x08090a0b +#define _VECTORMATH_PERM_W 0x0c0d0e0f +#define _VECTORMATH_PERM_A 0x10111213 +#define _VECTORMATH_PERM_B 0x14151617 +#define _VECTORMATH_PERM_C 0x18191a1b +#define _VECTORMATH_PERM_D 0x1c1d1e1f +#define _VECTORMATH_PERM_XYZA (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A } +#define _VECTORMATH_PERM_ZXYW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_W } +#define _VECTORMATH_PERM_YZXW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_W } +#define _VECTORMATH_PERM_YZAB (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B } +#define _VECTORMATH_PERM_ZABC (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B, _VECTORMATH_PERM_C } +#define _VECTORMATH_PERM_XYAW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_W } +#define _VECTORMATH_PERM_XAZW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_W } +#define _VECTORMATH_MASK_0xF000 (vec_uint4){ 0xffffffff, 0, 0, 0 } +#define _VECTORMATH_MASK_0x0F00 (vec_uint4){ 0, 0xffffffff, 0, 0 } +#define _VECTORMATH_MASK_0x00F0 (vec_uint4){ 0, 0, 0xffffffff, 0 } +#define _VECTORMATH_MASK_0x000F (vec_uint4){ 0, 0, 0, 0xffffffff } +#define _VECTORMATH_UNIT_1000 _mm_setr_ps(1.0f,0.0f,0.0f,0.0f) // (__m128){ 1.0f, 0.0f, 0.0f, 0.0f } +#define _VECTORMATH_UNIT_0100 _mm_setr_ps(0.0f,1.0f,0.0f,0.0f) // (__m128){ 0.0f, 1.0f, 0.0f, 0.0f } +#define _VECTORMATH_UNIT_0010 _mm_setr_ps(0.0f,0.0f,1.0f,0.0f) // (__m128){ 0.0f, 0.0f, 1.0f, 0.0f } +#define _VECTORMATH_UNIT_0001 _mm_setr_ps(0.0f,0.0f,0.0f,1.0f) // (__m128){ 0.0f, 0.0f, 0.0f, 1.0f } +#define _VECTORMATH_SLERP_TOL 0.999f +//_VECTORMATH_SLERP_TOLF + +//----------------------------------------------------------------------------- +// Definitions + +#ifndef _VECTORMATH_INTERNAL_FUNCTIONS +#define _VECTORMATH_INTERNAL_FUNCTIONS + +#define _vmath_shufps(a, b, immx, immy, immz, immw) _mm_shuffle_ps(a, b, _MM_SHUFFLE(immw, immz, immy, immx)) +static __forceinline __m128 _vmathVfDot3( __m128 vec0, __m128 vec1 ) +{ + __m128 result = _mm_mul_ps( vec0, vec1); + return _mm_add_ps( vec_splat( result, 0 ), _mm_add_ps( vec_splat( result, 1 ), vec_splat( result, 2 ) ) ); +} + +static __forceinline __m128 _vmathVfDot4( __m128 vec0, __m128 vec1 ) +{ + __m128 result = _mm_mul_ps(vec0, vec1); + return _mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(0,0,0,0)), + _mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(1,1,1,1)), + _mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(2,2,2,2)), _mm_shuffle_ps(result, result, _MM_SHUFFLE(3,3,3,3))))); +} + +static __forceinline __m128 _vmathVfCross( __m128 vec0, __m128 vec1 ) +{ + __m128 tmp0, tmp1, tmp2, tmp3, result; + tmp0 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,0,2,1) ); + tmp1 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,1,0,2) ); + tmp2 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,1,0,2) ); + tmp3 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,0,2,1) ); + result = vec_mul( tmp0, tmp1 ); + result = vec_nmsub( tmp2, tmp3, result ); + return result; +} +/* +static __forceinline vec_uint4 _vmathVfToHalfFloatsUnpacked(__m128 v) +{ +#if 0 + vec_int4 bexp; + vec_uint4 mant, sign, hfloat; + vec_uint4 notZero, isInf; + const vec_uint4 hfloatInf = (vec_uint4)(0x00007c00u); + const vec_uint4 mergeMant = (vec_uint4)(0x000003ffu); + const vec_uint4 mergeSign = (vec_uint4)(0x00008000u); + + sign = vec_sr((vec_uint4)v, (vec_uint4)16); + mant = vec_sr((vec_uint4)v, (vec_uint4)13); + bexp = vec_and(vec_sr((vec_int4)v, (vec_uint4)23), (vec_int4)0xff); + + notZero = (vec_uint4)vec_cmpgt(bexp, (vec_int4)112); + isInf = (vec_uint4)vec_cmpgt(bexp, (vec_int4)142); + + bexp = _mm_add_ps(bexp, (vec_int4)-112); + bexp = vec_sl(bexp, (vec_uint4)10); + + hfloat = vec_sel((vec_uint4)bexp, mant, mergeMant); + hfloat = vec_sel((vec_uint4)(0), hfloat, notZero); + hfloat = vec_sel(hfloat, hfloatInf, isInf); + hfloat = vec_sel(hfloat, sign, mergeSign); + + return hfloat; +#else + assert(0); + return _mm_setzero_ps(); +#endif +} + +static __forceinline vec_ushort8 _vmath2VfToHalfFloats(__m128 u, __m128 v) +{ +#if 0 + vec_uint4 hfloat_u, hfloat_v; + const vec_uchar16 pack = (vec_uchar16){2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31}; + hfloat_u = _vmathVfToHalfFloatsUnpacked(u); + hfloat_v = _vmathVfToHalfFloatsUnpacked(v); + return (vec_ushort8)vec_perm(hfloat_u, hfloat_v, pack); +#else + assert(0); + return _mm_setzero_si128(); +#endif +} +*/ + +static __forceinline __m128 _vmathVfInsert(__m128 dst, __m128 src, int slot) +{ + SSEFloat s; + s.m128 = src; + SSEFloat d; + d.m128 = dst; + d.f[slot] = s.f[slot]; + return d.m128; +} + +#define _vmathVfSetElement(vec, scalar, slot) ((float *)&(vec))[slot] = scalar + +static __forceinline __m128 _vmathVfSplatScalar(float scalar) +{ + return _mm_set1_ps(scalar); +} + +#endif + +namespace Vectormath { +namespace Aos { + + +#ifdef _VECTORMATH_NO_SCALAR_CAST +__forceinline VecIdx::operator floatInVec() const +{ + return floatInVec(ref, i); +} + +__forceinline float VecIdx::getAsFloat() const +#else +__forceinline VecIdx::operator float() const +#endif +{ + return ((float *)&ref)[i]; +} + +__forceinline float VecIdx::operator =( float scalar ) +{ + _vmathVfSetElement(ref, scalar, i); + return scalar; +} + +__forceinline floatInVec VecIdx::operator =( const floatInVec &scalar ) +{ + ref = _vmathVfInsert(ref, scalar.get128(), i); + return scalar; +} + +__forceinline floatInVec VecIdx::operator =( const VecIdx& scalar ) +{ + return *this = floatInVec(scalar.ref, scalar.i); +} + +__forceinline floatInVec VecIdx::operator *=( float scalar ) +{ + return *this *= floatInVec(scalar); +} + +__forceinline floatInVec VecIdx::operator *=( const floatInVec &scalar ) +{ + return *this = floatInVec(ref, i) * scalar; +} + +__forceinline floatInVec VecIdx::operator /=( float scalar ) +{ + return *this /= floatInVec(scalar); +} + +inline floatInVec VecIdx::operator /=( const floatInVec &scalar ) +{ + return *this = floatInVec(ref, i) / scalar; +} + +__forceinline floatInVec VecIdx::operator +=( float scalar ) +{ + return *this += floatInVec(scalar); +} + +__forceinline floatInVec VecIdx::operator +=( const floatInVec &scalar ) +{ + return *this = floatInVec(ref, i) + scalar; +} + +__forceinline floatInVec VecIdx::operator -=( float scalar ) +{ + return *this -= floatInVec(scalar); +} + +__forceinline floatInVec VecIdx::operator -=( const floatInVec &scalar ) +{ + return *this = floatInVec(ref, i) - scalar; +} + +__forceinline Vector3::Vector3(const Vector3& vec) +{ + set128(vec.get128()); +} + +__forceinline void Vector3::set128(vec_float4 vec) +{ + mVec128 = vec; +} + + +__forceinline Vector3::Vector3( float _x, float _y, float _z ) +{ + mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f); +} + +__forceinline Vector3::Vector3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z ) +{ + __m128 xz = _mm_unpacklo_ps( _x.get128(), _z.get128() ); + mVec128 = _mm_unpacklo_ps( xz, _y.get128() ); +} + +__forceinline Vector3::Vector3( const Point3 &pnt ) +{ + mVec128 = pnt.get128(); +} + +__forceinline Vector3::Vector3( float scalar ) +{ + mVec128 = floatInVec(scalar).get128(); +} + +__forceinline Vector3::Vector3( const floatInVec &scalar ) +{ + mVec128 = scalar.get128(); +} + +__forceinline Vector3::Vector3( __m128 vf4 ) +{ + mVec128 = vf4; +} + +__forceinline const Vector3 Vector3::xAxis( ) +{ + return Vector3( _VECTORMATH_UNIT_1000 ); +} + +__forceinline const Vector3 Vector3::yAxis( ) +{ + return Vector3( _VECTORMATH_UNIT_0100 ); +} + +__forceinline const Vector3 Vector3::zAxis( ) +{ + return Vector3( _VECTORMATH_UNIT_0010 ); +} + +__forceinline const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 ) +{ + return lerp( floatInVec(t), vec0, vec1 ); +} + +__forceinline const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 ) +{ + return ( vec0 + ( ( vec1 - vec0 ) * t ) ); +} + +__forceinline const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 ) +{ + return slerp( floatInVec(t), unitVec0, unitVec1 ); +} + +__forceinline const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 ) +{ + __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines; + cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() ); + __m128 selectMask = _mm_cmpgt_ps( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle ); + angle = acosf4( cosAngle ); + tttt = t.get128(); + oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt ); + angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt ); // angles = 1, t, 1, t + angles = _mm_unpacklo_ps( angles, oneMinusT ); // angles = 1, 1-t, t, 1-t + angles = _mm_mul_ps( angles, angle ); + sines = sinf4( angles ); + scales = _mm_div_ps( sines, vec_splat( sines, 0 ) ); + scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask ); + scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask ); + return Vector3( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) ); +} + +__forceinline __m128 Vector3::get128( ) const +{ + return mVec128; +} + +__forceinline void loadXYZ(Vector3& vec, const float* fptr) +{ +#ifdef USE_SSE2_LDDQU + vec = Vector3( SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128 ); +#else + SSEFloat fl; + fl.f[0] = fptr[0]; + fl.f[1] = fptr[1]; + fl.f[2] = fptr[2]; + fl.f[3] = fptr[3]; + vec = Vector3( fl.m128); +#endif //USE_SSE2_LDDQU + +} + +__forceinline void storeXYZ( const Vector3 &vec, __m128 * quad ) +{ + __m128 dstVec = *quad; + __declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize + dstVec = vec_sel(vec.get128(), dstVec, sw); + *quad = dstVec; +} + +__forceinline void storeXYZ(const Vector3& vec, float* fptr) +{ + fptr[0] = vec.getX(); + fptr[1] = vec.getY(); + fptr[2] = vec.getZ(); +} + + +__forceinline void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads ) +{ + const float *quads = (float *)threeQuads; + vec0 = Vector3( _mm_load_ps(quads) ); + vec1 = Vector3( _mm_loadu_ps(quads + 3) ); + vec2 = Vector3( _mm_loadu_ps(quads + 6) ); + vec3 = Vector3( _mm_loadu_ps(quads + 9) ); +} + +__forceinline void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads ) +{ + __m128 xxxx = _mm_shuffle_ps( vec1.get128(), vec1.get128(), _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 zzzz = _mm_shuffle_ps( vec2.get128(), vec2.get128(), _MM_SHUFFLE(2, 2, 2, 2) ); + __declspec(align(16)) unsigned int xsw[4] = {0, 0, 0, 0xffffffff}; + __declspec(align(16)) unsigned int zsw[4] = {0xffffffff, 0, 0, 0}; + threeQuads[0] = vec_sel( vec0.get128(), xxxx, xsw ); + threeQuads[1] = _mm_shuffle_ps( vec1.get128(), vec2.get128(), _MM_SHUFFLE(1, 0, 2, 1) ); + threeQuads[2] = vec_sel( _mm_shuffle_ps( vec3.get128(), vec3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw ); +} +/* +__forceinline void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads ) +{ + assert(0); +#if 0 + __m128 xyz0[3]; + __m128 xyz1[3]; + storeXYZArray( vec0, vec1, vec2, vec3, xyz0 ); + storeXYZArray( vec4, vec5, vec6, vec7, xyz1 ); + threeQuads[0] = _vmath2VfToHalfFloats(xyz0[0], xyz0[1]); + threeQuads[1] = _vmath2VfToHalfFloats(xyz0[2], xyz1[0]); + threeQuads[2] = _vmath2VfToHalfFloats(xyz1[1], xyz1[2]); +#endif +} +*/ +__forceinline Vector3 & Vector3::operator =( const Vector3 &vec ) +{ + mVec128 = vec.mVec128; + return *this; +} + +__forceinline Vector3 & Vector3::setX( float _x ) +{ + _vmathVfSetElement(mVec128, _x, 0); + return *this; +} + +__forceinline Vector3 & Vector3::setX( const floatInVec &_x ) +{ + mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0); + return *this; +} + +__forceinline const floatInVec Vector3::getX( ) const +{ + return floatInVec( mVec128, 0 ); +} + +__forceinline Vector3 & Vector3::setY( float _y ) +{ + _vmathVfSetElement(mVec128, _y, 1); + return *this; +} + +__forceinline Vector3 & Vector3::setY( const floatInVec &_y ) +{ + mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1); + return *this; +} + +__forceinline const floatInVec Vector3::getY( ) const +{ + return floatInVec( mVec128, 1 ); +} + +__forceinline Vector3 & Vector3::setZ( float _z ) +{ + _vmathVfSetElement(mVec128, _z, 2); + return *this; +} + +__forceinline Vector3 & Vector3::setZ( const floatInVec &_z ) +{ + mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2); + return *this; +} + +__forceinline const floatInVec Vector3::getZ( ) const +{ + return floatInVec( mVec128, 2 ); +} + +__forceinline Vector3 & Vector3::setElem( int idx, float value ) +{ + _vmathVfSetElement(mVec128, value, idx); + return *this; +} + +__forceinline Vector3 & Vector3::setElem( int idx, const floatInVec &value ) +{ + mVec128 = _vmathVfInsert(mVec128, value.get128(), idx); + return *this; +} + +__forceinline const floatInVec Vector3::getElem( int idx ) const +{ + return floatInVec( mVec128, idx ); +} + +__forceinline VecIdx Vector3::operator []( int idx ) +{ + return VecIdx( mVec128, idx ); +} + +__forceinline const floatInVec Vector3::operator []( int idx ) const +{ + return floatInVec( mVec128, idx ); +} + +__forceinline const Vector3 Vector3::operator +( const Vector3 &vec ) const +{ + return Vector3( _mm_add_ps( mVec128, vec.mVec128 ) ); +} + +__forceinline const Vector3 Vector3::operator -( const Vector3 &vec ) const +{ + return Vector3( _mm_sub_ps( mVec128, vec.mVec128 ) ); +} + +__forceinline const Point3 Vector3::operator +( const Point3 &pnt ) const +{ + return Point3( _mm_add_ps( mVec128, pnt.get128() ) ); +} + +__forceinline const Vector3 Vector3::operator *( float scalar ) const +{ + return *this * floatInVec(scalar); +} + +__forceinline const Vector3 Vector3::operator *( const floatInVec &scalar ) const +{ + return Vector3( _mm_mul_ps( mVec128, scalar.get128() ) ); +} + +__forceinline Vector3 & Vector3::operator +=( const Vector3 &vec ) +{ + *this = *this + vec; + return *this; +} + +__forceinline Vector3 & Vector3::operator -=( const Vector3 &vec ) +{ + *this = *this - vec; + return *this; +} + +__forceinline Vector3 & Vector3::operator *=( float scalar ) +{ + *this = *this * scalar; + return *this; +} + +__forceinline Vector3 & Vector3::operator *=( const floatInVec &scalar ) +{ + *this = *this * scalar; + return *this; +} + +__forceinline const Vector3 Vector3::operator /( float scalar ) const +{ + return *this / floatInVec(scalar); +} + +__forceinline const Vector3 Vector3::operator /( const floatInVec &scalar ) const +{ + return Vector3( _mm_div_ps( mVec128, scalar.get128() ) ); +} + +__forceinline Vector3 & Vector3::operator /=( float scalar ) +{ + *this = *this / scalar; + return *this; +} + +__forceinline Vector3 & Vector3::operator /=( const floatInVec &scalar ) +{ + *this = *this / scalar; + return *this; +} + +__forceinline const Vector3 Vector3::operator -( ) const +{ + //return Vector3(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) ); + + __declspec(align(16)) static const int array[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; + __m128 NEG_MASK = SSEFloat(*(const vec_float4*)array).vf; + return Vector3(_mm_xor_ps(get128(),NEG_MASK)); +} + +__forceinline const Vector3 operator *( float scalar, const Vector3 &vec ) +{ + return floatInVec(scalar) * vec; +} + +__forceinline const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec ) +{ + return vec * scalar; +} + +__forceinline const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 ) +{ + return Vector3( _mm_mul_ps( vec0.get128(), vec1.get128() ) ); +} + +__forceinline const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 ) +{ + return Vector3( _mm_div_ps( vec0.get128(), vec1.get128() ) ); +} + +__forceinline const Vector3 recipPerElem( const Vector3 &vec ) +{ + return Vector3( _mm_rcp_ps( vec.get128() ) ); +} + +__forceinline const Vector3 absPerElem( const Vector3 &vec ) +{ + return Vector3( fabsf4( vec.get128() ) ); +} + +__forceinline const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 ) +{ + __m128 vmask = toM128(0x7fffffff); + return Vector3( _mm_or_ps( + _mm_and_ps ( vmask, vec0.get128() ), // Value + _mm_andnot_ps( vmask, vec1.get128() ) ) ); // Signs +} + +__forceinline const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 ) +{ + return Vector3( _mm_max_ps( vec0.get128(), vec1.get128() ) ); +} + +__forceinline const floatInVec maxElem( const Vector3 &vec ) +{ + return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) ); +} + +__forceinline const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 ) +{ + return Vector3( _mm_min_ps( vec0.get128(), vec1.get128() ) ); +} + +__forceinline const floatInVec minElem( const Vector3 &vec ) +{ + return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) ); +} + +__forceinline const floatInVec sum( const Vector3 &vec ) +{ + return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) ); +} + +__forceinline const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 ) +{ + return floatInVec( _vmathVfDot3( vec0.get128(), vec1.get128() ), 0 ); +} + +__forceinline const floatInVec lengthSqr( const Vector3 &vec ) +{ + return floatInVec( _vmathVfDot3( vec.get128(), vec.get128() ), 0 ); +} + +__forceinline const floatInVec length( const Vector3 &vec ) +{ + return floatInVec( _mm_sqrt_ps(_vmathVfDot3( vec.get128(), vec.get128() )), 0 ); +} + + +__forceinline const Vector3 normalizeApprox( const Vector3 &vec ) +{ + return Vector3( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) ); +} + +__forceinline const Vector3 normalize( const Vector3 &vec ) +{ + return Vector3( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) ); +} + +__forceinline const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 ) +{ + return Vector3( _vmathVfCross( vec0.get128(), vec1.get128() ) ); +} + +__forceinline const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 ) +{ + return select( vec0, vec1, boolInVec(select1) ); +} + + +__forceinline const Vector4 select(const Vector4& vec0, const Vector4& vec1, const boolInVec& select1) +{ + return Vector4(vec_sel(vec0.get128(), vec1.get128(), select1.get128())); +} + +#ifdef _VECTORMATH_DEBUG + +__forceinline void print( const Vector3 &vec ) +{ + union { __m128 v; float s[4]; } tmp; + tmp.v = vec.get128(); + printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] ); +} + +__forceinline void print( const Vector3 &vec, const char * name ) +{ + union { __m128 v; float s[4]; } tmp; + tmp.v = vec.get128(); + printf( "%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] ); +} + +#endif + +__forceinline Vector4::Vector4( float _x, float _y, float _z, float _w ) +{ + mVec128 = _mm_setr_ps(_x, _y, _z, _w); + } + +__forceinline Vector4::Vector4( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w ) +{ + mVec128 = _mm_unpacklo_ps( + _mm_unpacklo_ps( _x.get128(), _z.get128() ), + _mm_unpacklo_ps( _y.get128(), _w.get128() ) ); +} + +__forceinline Vector4::Vector4( const Vector3 &xyz, float _w ) +{ + mVec128 = xyz.get128(); + _vmathVfSetElement(mVec128, _w, 3); +} + +__forceinline Vector4::Vector4( const Vector3 &xyz, const floatInVec &_w ) +{ + mVec128 = xyz.get128(); + mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3); +} + +__forceinline Vector4::Vector4( const Vector3 &vec ) +{ + mVec128 = vec.get128(); + mVec128 = _vmathVfInsert(mVec128, _mm_setzero_ps(), 3); +} + +__forceinline Vector4::Vector4( const Point3 &pnt ) +{ + mVec128 = pnt.get128(); + mVec128 = _vmathVfInsert(mVec128, _mm_set1_ps(1.0f), 3); +} + +__forceinline Vector4::Vector4( const Quat &quat ) +{ + mVec128 = quat.get128(); +} + +__forceinline Vector4::Vector4( float scalar ) +{ + mVec128 = floatInVec(scalar).get128(); +} + +__forceinline Vector4::Vector4( const floatInVec &scalar ) +{ + mVec128 = scalar.get128(); +} + +__forceinline Vector4::Vector4( __m128 vf4 ) +{ + mVec128 = vf4; +} + +__forceinline const Vector4 Vector4::xAxis( ) +{ + return Vector4( _VECTORMATH_UNIT_1000 ); +} + +__forceinline const Vector4 Vector4::yAxis( ) +{ + return Vector4( _VECTORMATH_UNIT_0100 ); +} + +__forceinline const Vector4 Vector4::zAxis( ) +{ + return Vector4( _VECTORMATH_UNIT_0010 ); +} + +__forceinline const Vector4 Vector4::wAxis( ) +{ + return Vector4( _VECTORMATH_UNIT_0001 ); +} + +__forceinline const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 ) +{ + return lerp( floatInVec(t), vec0, vec1 ); +} + +__forceinline const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 ) +{ + return ( vec0 + ( ( vec1 - vec0 ) * t ) ); +} + +__forceinline const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 ) +{ + return slerp( floatInVec(t), unitVec0, unitVec1 ); +} + +__forceinline const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 ) +{ + __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines; + cosAngle = _vmathVfDot4( unitVec0.get128(), unitVec1.get128() ); + __m128 selectMask = _mm_cmpgt_ps( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle ); + angle = acosf4( cosAngle ); + tttt = t.get128(); + oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt ); + angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt ); // angles = 1, t, 1, t + angles = _mm_unpacklo_ps( angles, oneMinusT ); // angles = 1, 1-t, t, 1-t + angles = _mm_mul_ps( angles, angle ); + sines = sinf4( angles ); + scales = _mm_div_ps( sines, vec_splat( sines, 0 ) ); + scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask ); + scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask ); + return Vector4( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) ); +} + +__forceinline __m128 Vector4::get128( ) const +{ + return mVec128; +} +/* +__forceinline void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads ) +{ + twoQuads[0] = _vmath2VfToHalfFloats(vec0.get128(), vec1.get128()); + twoQuads[1] = _vmath2VfToHalfFloats(vec2.get128(), vec3.get128()); +} +*/ +__forceinline Vector4 & Vector4::operator =( const Vector4 &vec ) +{ + mVec128 = vec.mVec128; + return *this; +} + +__forceinline Vector4 & Vector4::setXYZ( const Vector3 &vec ) +{ + __declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff}; + mVec128 = vec_sel( vec.get128(), mVec128, sw ); + return *this; +} + +__forceinline const Vector3 Vector4::getXYZ( ) const +{ + return Vector3( mVec128 ); +} + +__forceinline Vector4 & Vector4::setX( float _x ) +{ + _vmathVfSetElement(mVec128, _x, 0); + return *this; +} + +__forceinline Vector4 & Vector4::setX( const floatInVec &_x ) +{ + mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0); + return *this; +} + +__forceinline const floatInVec Vector4::getX( ) const +{ + return floatInVec( mVec128, 0 ); +} + +__forceinline Vector4 & Vector4::setY( float _y ) +{ + _vmathVfSetElement(mVec128, _y, 1); + return *this; +} + +__forceinline Vector4 & Vector4::setY( const floatInVec &_y ) +{ + mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1); + return *this; +} + +__forceinline const floatInVec Vector4::getY( ) const +{ + return floatInVec( mVec128, 1 ); +} + +__forceinline Vector4 & Vector4::setZ( float _z ) +{ + _vmathVfSetElement(mVec128, _z, 2); + return *this; +} + +__forceinline Vector4 & Vector4::setZ( const floatInVec &_z ) +{ + mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2); + return *this; +} + +__forceinline const floatInVec Vector4::getZ( ) const +{ + return floatInVec( mVec128, 2 ); +} + +__forceinline Vector4 & Vector4::setW( float _w ) +{ + _vmathVfSetElement(mVec128, _w, 3); + return *this; +} + +__forceinline Vector4 & Vector4::setW( const floatInVec &_w ) +{ + mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3); + return *this; +} + +__forceinline const floatInVec Vector4::getW( ) const +{ + return floatInVec( mVec128, 3 ); +} + +__forceinline Vector4 & Vector4::setElem( int idx, float value ) +{ + _vmathVfSetElement(mVec128, value, idx); + return *this; +} + +__forceinline Vector4 & Vector4::setElem( int idx, const floatInVec &value ) +{ + mVec128 = _vmathVfInsert(mVec128, value.get128(), idx); + return *this; +} + +__forceinline const floatInVec Vector4::getElem( int idx ) const +{ + return floatInVec( mVec128, idx ); +} + +__forceinline VecIdx Vector4::operator []( int idx ) +{ + return VecIdx( mVec128, idx ); +} + +__forceinline const floatInVec Vector4::operator []( int idx ) const +{ + return floatInVec( mVec128, idx ); +} + +__forceinline const Vector4 Vector4::operator +( const Vector4 &vec ) const +{ + return Vector4( _mm_add_ps( mVec128, vec.mVec128 ) ); +} + +__forceinline const Vector4 Vector4::operator -( const Vector4 &vec ) const +{ + return Vector4( _mm_sub_ps( mVec128, vec.mVec128 ) ); +} + +__forceinline const Vector4 Vector4::operator *( float scalar ) const +{ + return *this * floatInVec(scalar); +} + +__forceinline const Vector4 Vector4::operator *( const floatInVec &scalar ) const +{ + return Vector4( _mm_mul_ps( mVec128, scalar.get128() ) ); +} + +__forceinline Vector4 & Vector4::operator +=( const Vector4 &vec ) +{ + *this = *this + vec; + return *this; +} + +__forceinline Vector4 & Vector4::operator -=( const Vector4 &vec ) +{ + *this = *this - vec; + return *this; +} + +__forceinline Vector4 & Vector4::operator *=( float scalar ) +{ + *this = *this * scalar; + return *this; +} + +__forceinline Vector4 & Vector4::operator *=( const floatInVec &scalar ) +{ + *this = *this * scalar; + return *this; +} + +__forceinline const Vector4 Vector4::operator /( float scalar ) const +{ + return *this / floatInVec(scalar); +} + +__forceinline const Vector4 Vector4::operator /( const floatInVec &scalar ) const +{ + return Vector4( _mm_div_ps( mVec128, scalar.get128() ) ); +} + +__forceinline Vector4 & Vector4::operator /=( float scalar ) +{ + *this = *this / scalar; + return *this; +} + +__forceinline Vector4 & Vector4::operator /=( const floatInVec &scalar ) +{ + *this = *this / scalar; + return *this; +} + +__forceinline const Vector4 Vector4::operator -( ) const +{ + return Vector4(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) ); +} + +__forceinline const Vector4 operator *( float scalar, const Vector4 &vec ) +{ + return floatInVec(scalar) * vec; +} + +__forceinline const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec ) +{ + return vec * scalar; +} + +__forceinline const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 ) +{ + return Vector4( _mm_mul_ps( vec0.get128(), vec1.get128() ) ); +} + +__forceinline const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 ) +{ + return Vector4( _mm_div_ps( vec0.get128(), vec1.get128() ) ); +} + +__forceinline const Vector4 recipPerElem( const Vector4 &vec ) +{ + return Vector4( _mm_rcp_ps( vec.get128() ) ); +} + +__forceinline const Vector4 absPerElem( const Vector4 &vec ) +{ + return Vector4( fabsf4( vec.get128() ) ); +} + +__forceinline const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 ) +{ + __m128 vmask = toM128(0x7fffffff); + return Vector4( _mm_or_ps( + _mm_and_ps ( vmask, vec0.get128() ), // Value + _mm_andnot_ps( vmask, vec1.get128() ) ) ); // Signs +} + +__forceinline const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 ) +{ + return Vector4( _mm_max_ps( vec0.get128(), vec1.get128() ) ); +} + +__forceinline const floatInVec maxElem( const Vector4 &vec ) +{ + return floatInVec( _mm_max_ps( + _mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), + _mm_max_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) ); +} + +__forceinline const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 ) +{ + return Vector4( _mm_min_ps( vec0.get128(), vec1.get128() ) ); +} + +__forceinline const floatInVec minElem( const Vector4 &vec ) +{ + return floatInVec( _mm_min_ps( + _mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), + _mm_min_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) ); +} + +__forceinline const floatInVec sum( const Vector4 &vec ) +{ + return floatInVec( _mm_add_ps( + _mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), + _mm_add_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) ); +} + +__forceinline const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 ) +{ + return floatInVec( _vmathVfDot4( vec0.get128(), vec1.get128() ), 0 ); +} + +__forceinline const floatInVec lengthSqr( const Vector4 &vec ) +{ + return floatInVec( _vmathVfDot4( vec.get128(), vec.get128() ), 0 ); +} + +__forceinline const floatInVec length( const Vector4 &vec ) +{ + return floatInVec( _mm_sqrt_ps(_vmathVfDot4( vec.get128(), vec.get128() )), 0 ); +} + +__forceinline const Vector4 normalizeApprox( const Vector4 &vec ) +{ + return Vector4( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) ); +} + +__forceinline const Vector4 normalize( const Vector4 &vec ) +{ + return Vector4( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) ); +} + +__forceinline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 ) +{ + return select( vec0, vec1, boolInVec(select1) ); +} + + +#ifdef _VECTORMATH_DEBUG + +__forceinline void print( const Vector4 &vec ) +{ + union { __m128 v; float s[4]; } tmp; + tmp.v = vec.get128(); + printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] ); +} + +__forceinline void print( const Vector4 &vec, const char * name ) +{ + union { __m128 v; float s[4]; } tmp; + tmp.v = vec.get128(); + printf( "%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] ); +} + +#endif + +__forceinline Point3::Point3( float _x, float _y, float _z ) +{ + mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f); +} + +__forceinline Point3::Point3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z ) +{ + mVec128 = _mm_unpacklo_ps( _mm_unpacklo_ps( _x.get128(), _z.get128() ), _y.get128() ); +} + +__forceinline Point3::Point3( const Vector3 &vec ) +{ + mVec128 = vec.get128(); +} + +__forceinline Point3::Point3( float scalar ) +{ + mVec128 = floatInVec(scalar).get128(); +} + +__forceinline Point3::Point3( const floatInVec &scalar ) +{ + mVec128 = scalar.get128(); +} + +__forceinline Point3::Point3( __m128 vf4 ) +{ + mVec128 = vf4; +} + +__forceinline const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 ) +{ + return lerp( floatInVec(t), pnt0, pnt1 ); +} + +__forceinline const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 ) +{ + return ( pnt0 + ( ( pnt1 - pnt0 ) * t ) ); +} + +__forceinline __m128 Point3::get128( ) const +{ + return mVec128; +} + +__forceinline void storeXYZ( const Point3 &pnt, __m128 * quad ) +{ + __m128 dstVec = *quad; + __declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize + dstVec = vec_sel(pnt.get128(), dstVec, sw); + *quad = dstVec; +} + +__forceinline void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads ) +{ + const float *quads = (float *)threeQuads; + pnt0 = Point3( _mm_load_ps(quads) ); + pnt1 = Point3( _mm_loadu_ps(quads + 3) ); + pnt2 = Point3( _mm_loadu_ps(quads + 6) ); + pnt3 = Point3( _mm_loadu_ps(quads + 9) ); +} + +__forceinline void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads ) +{ + __m128 xxxx = _mm_shuffle_ps( pnt1.get128(), pnt1.get128(), _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 zzzz = _mm_shuffle_ps( pnt2.get128(), pnt2.get128(), _MM_SHUFFLE(2, 2, 2, 2) ); + __declspec(align(16)) unsigned int xsw[4] = {0, 0, 0, 0xffffffff}; + __declspec(align(16)) unsigned int zsw[4] = {0xffffffff, 0, 0, 0}; + threeQuads[0] = vec_sel( pnt0.get128(), xxxx, xsw ); + threeQuads[1] = _mm_shuffle_ps( pnt1.get128(), pnt2.get128(), _MM_SHUFFLE(1, 0, 2, 1) ); + threeQuads[2] = vec_sel( _mm_shuffle_ps( pnt3.get128(), pnt3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw ); +} +/* +__forceinline void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads ) +{ +#if 0 + __m128 xyz0[3]; + __m128 xyz1[3]; + storeXYZArray( pnt0, pnt1, pnt2, pnt3, xyz0 ); + storeXYZArray( pnt4, pnt5, pnt6, pnt7, xyz1 ); + threeQuads[0] = _vmath2VfToHalfFloats(xyz0[0], xyz0[1]); + threeQuads[1] = _vmath2VfToHalfFloats(xyz0[2], xyz1[0]); + threeQuads[2] = _vmath2VfToHalfFloats(xyz1[1], xyz1[2]); +#else + assert(0); +#endif +} +*/ +__forceinline Point3 & Point3::operator =( const Point3 &pnt ) +{ + mVec128 = pnt.mVec128; + return *this; +} + +__forceinline Point3 & Point3::setX( float _x ) +{ + _vmathVfSetElement(mVec128, _x, 0); + return *this; +} + +__forceinline Point3 & Point3::setX( const floatInVec &_x ) +{ + mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0); + return *this; +} + +__forceinline const floatInVec Point3::getX( ) const +{ + return floatInVec( mVec128, 0 ); +} + +__forceinline Point3 & Point3::setY( float _y ) +{ + _vmathVfSetElement(mVec128, _y, 1); + return *this; +} + +__forceinline Point3 & Point3::setY( const floatInVec &_y ) +{ + mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1); + return *this; +} + +__forceinline const floatInVec Point3::getY( ) const +{ + return floatInVec( mVec128, 1 ); +} + +__forceinline Point3 & Point3::setZ( float _z ) +{ + _vmathVfSetElement(mVec128, _z, 2); + return *this; +} + +__forceinline Point3 & Point3::setZ( const floatInVec &_z ) +{ + mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2); + return *this; +} + +__forceinline const floatInVec Point3::getZ( ) const +{ + return floatInVec( mVec128, 2 ); +} + +__forceinline Point3 & Point3::setElem( int idx, float value ) +{ + _vmathVfSetElement(mVec128, value, idx); + return *this; +} + +__forceinline Point3 & Point3::setElem( int idx, const floatInVec &value ) +{ + mVec128 = _vmathVfInsert(mVec128, value.get128(), idx); + return *this; +} + +__forceinline const floatInVec Point3::getElem( int idx ) const +{ + return floatInVec( mVec128, idx ); +} + +__forceinline VecIdx Point3::operator []( int idx ) +{ + return VecIdx( mVec128, idx ); +} + +__forceinline const floatInVec Point3::operator []( int idx ) const +{ + return floatInVec( mVec128, idx ); +} + +__forceinline const Vector3 Point3::operator -( const Point3 &pnt ) const +{ + return Vector3( _mm_sub_ps( mVec128, pnt.mVec128 ) ); +} + +__forceinline const Point3 Point3::operator +( const Vector3 &vec ) const +{ + return Point3( _mm_add_ps( mVec128, vec.get128() ) ); +} + +__forceinline const Point3 Point3::operator -( const Vector3 &vec ) const +{ + return Point3( _mm_sub_ps( mVec128, vec.get128() ) ); +} + +__forceinline Point3 & Point3::operator +=( const Vector3 &vec ) +{ + *this = *this + vec; + return *this; +} + +__forceinline Point3 & Point3::operator -=( const Vector3 &vec ) +{ + *this = *this - vec; + return *this; +} + +__forceinline const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 ) +{ + return Point3( _mm_mul_ps( pnt0.get128(), pnt1.get128() ) ); +} + +__forceinline const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 ) +{ + return Point3( _mm_div_ps( pnt0.get128(), pnt1.get128() ) ); +} + +__forceinline const Point3 recipPerElem( const Point3 &pnt ) +{ + return Point3( _mm_rcp_ps( pnt.get128() ) ); +} + +__forceinline const Point3 absPerElem( const Point3 &pnt ) +{ + return Point3( fabsf4( pnt.get128() ) ); +} + +__forceinline const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 ) +{ + __m128 vmask = toM128(0x7fffffff); + return Point3( _mm_or_ps( + _mm_and_ps ( vmask, pnt0.get128() ), // Value + _mm_andnot_ps( vmask, pnt1.get128() ) ) ); // Signs +} + +__forceinline const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 ) +{ + return Point3( _mm_max_ps( pnt0.get128(), pnt1.get128() ) ); +} + +__forceinline const floatInVec maxElem( const Point3 &pnt ) +{ + return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) ); +} + +__forceinline const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 ) +{ + return Point3( _mm_min_ps( pnt0.get128(), pnt1.get128() ) ); +} + +__forceinline const floatInVec minElem( const Point3 &pnt ) +{ + return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) ); +} + +__forceinline const floatInVec sum( const Point3 &pnt ) +{ + return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) ); +} + +__forceinline const Point3 scale( const Point3 &pnt, float scaleVal ) +{ + return scale( pnt, floatInVec( scaleVal ) ); +} + +__forceinline const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal ) +{ + return mulPerElem( pnt, Point3( scaleVal ) ); +} + +__forceinline const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec ) +{ + return mulPerElem( pnt, Point3( scaleVec ) ); +} + +__forceinline const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec ) +{ + return floatInVec( _vmathVfDot3( pnt.get128(), unitVec.get128() ), 0 ); +} + +__forceinline const floatInVec distSqrFromOrigin( const Point3 &pnt ) +{ + return lengthSqr( Vector3( pnt ) ); +} + +__forceinline const floatInVec distFromOrigin( const Point3 &pnt ) +{ + return length( Vector3( pnt ) ); +} + +__forceinline const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 ) +{ + return lengthSqr( ( pnt1 - pnt0 ) ); +} + +__forceinline const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 ) +{ + return length( ( pnt1 - pnt0 ) ); +} + +__forceinline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 ) +{ + return select( pnt0, pnt1, boolInVec(select1) ); +} + +__forceinline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 ) +{ + return Point3( vec_sel( pnt0.get128(), pnt1.get128(), select1.get128() ) ); +} + + + +#ifdef _VECTORMATH_DEBUG + +__forceinline void print( const Point3 &pnt ) +{ + union { __m128 v; float s[4]; } tmp; + tmp.v = pnt.get128(); + printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] ); +} + +__forceinline void print( const Point3 &pnt, const char * name ) +{ + union { __m128 v; float s[4]; } tmp; + tmp.v = pnt.get128(); + printf( "%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] ); +} + +#endif + +} // namespace Aos +} // namespace Vectormath + +#endif diff --git a/src/BulletMultiThreaded/vectormath/sse/vecidx_aos.h b/src/BulletMultiThreaded/vectormath/sse/vecidx_aos.h new file mode 100644 index 000000000..c5c6eb908 --- /dev/null +++ b/src/BulletMultiThreaded/vectormath/sse/vecidx_aos.h @@ -0,0 +1,80 @@ +/* + Copyright (C) 2006, 2007 Sony Computer Entertainment Inc. + All rights reserved. + + Redistribution and use in source and binary forms, + with or without modification, are permitted provided that the + following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the Sony Computer Entertainment Inc nor the names + of its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _VECTORMATH_VECIDX_AOS_H +#define _VECTORMATH_VECIDX_AOS_H + + +#include "floatInVec.h" + +namespace Vectormath { +namespace Aos { + +//----------------------------------------------------------------------------- +// VecIdx +// Used in setting elements of Vector3, Vector4, Point3, or Quat with the +// subscripting operator. +// + +__declspec(align(16)) class VecIdx +{ +private: + __m128 &ref; + int i; +public: + inline VecIdx( __m128& vec, int idx ): ref(vec) { i = idx; } + + // implicitly casts to float unless _VECTORMATH_NO_SCALAR_CAST defined + // in which case, implicitly casts to floatInVec, and one must call + // getAsFloat to convert to float. + // +#ifdef _VECTORMATH_NO_SCALAR_CAST + inline operator floatInVec() const; + inline float getAsFloat() const; +#else + inline operator float() const; +#endif + + inline float operator =( float scalar ); + inline floatInVec operator =( const floatInVec &scalar ); + inline floatInVec operator =( const VecIdx& scalar ); + inline floatInVec operator *=( float scalar ); + inline floatInVec operator *=( const floatInVec &scalar ); + inline floatInVec operator /=( float scalar ); + inline floatInVec operator /=( const floatInVec &scalar ); + inline floatInVec operator +=( float scalar ); + inline floatInVec operator +=( const floatInVec &scalar ); + inline floatInVec operator -=( float scalar ); + inline floatInVec operator -=( const floatInVec &scalar ); +}; + +} // namespace Aos +} // namespace Vectormath + +#endif diff --git a/src/BulletMultiThreaded/vectormath/sse/vectormath_aos.h b/src/BulletMultiThreaded/vectormath/sse/vectormath_aos.h new file mode 100644 index 000000000..0430ef7af --- /dev/null +++ b/src/BulletMultiThreaded/vectormath/sse/vectormath_aos.h @@ -0,0 +1,2532 @@ +/* + Copyright (C) 2006, 2007 Sony Computer Entertainment Inc. + All rights reserved. + + Redistribution and use in source and binary forms, + with or without modification, are permitted provided that the + following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the Sony Computer Entertainment Inc nor the names + of its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + + +#ifndef _VECTORMATH_AOS_CPP_SSE_H +#define _VECTORMATH_AOS_CPP_SSE_H + +#include +#include +#include +#include + +#define Vector3Ref Vector3& +#define QuatRef Quat& +#define Matrix3Ref Matrix3& +#define VM_ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a + +#define USE_SSE2_LDDQU +#ifdef USE_SSE2_LDDQU +#include //used for _mm_lddqu_si128 +#endif //USE_SSE2_LDDQU + +// TODO: Tidy +typedef __m128 vec_float4; +typedef __m128 vec_uint4; +typedef __m128 vec_int4; +typedef __m128i vec_uchar16; +typedef __m128i vec_ushort8; + +#define vec_splat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e)) + +#define _mm_ror_ps(vec,i) \ + (((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(i+3)%4,(unsigned char)(i+2)%4,(unsigned char)(i+1)%4,(unsigned char)(i+0)%4))) : (vec)) +#define _mm_rol_ps(vec,i) \ + (((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(7-i)%4,(unsigned char)(6-i)%4,(unsigned char)(5-i)%4,(unsigned char)(4-i)%4))) : (vec)) + +#define vec_sld(vec,vec2,x) _mm_ror_ps(vec, ((x)/4)) + +#define _mm_abs_ps(vec) _mm_andnot_ps(_MASKSIGN_,vec) +#define _mm_neg_ps(vec) _mm_xor_ps(_MASKSIGN_,vec) + +#define vec_madd(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b) ) + +union SSEFloat +{ + __m128i vi; + __m128 m128; + __m128 vf; + unsigned int ui[4]; + unsigned short s[8]; + float f[4]; + SSEFloat(__m128 v) : m128(v) {} + SSEFloat(__m128i v) : vi(v) {} + SSEFloat() {}//uninitialized +}; + +static __forceinline __m128 vec_sel(__m128 a, __m128 b, __m128 mask) +{ + return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a)); +} +static __forceinline __m128 vec_sel(__m128 a, __m128 b, const unsigned int *_mask) +{ + return vec_sel(a, b, _mm_load_ps((float *)_mask)); +} +static __forceinline __m128 vec_sel(__m128 a, __m128 b, unsigned int _mask) +{ + return vec_sel(a, b, _mm_set1_ps(*(float *)&_mask)); +} + +static __forceinline __m128 toM128(unsigned int x) +{ + return _mm_set1_ps( *(float *)&x ); +} + +static __forceinline __m128 fabsf4(__m128 x) +{ + return _mm_and_ps( x, toM128( 0x7fffffff ) ); +} +/* +union SSE64 +{ + __m128 m128; + struct + { + __m64 m01; + __m64 m23; + } m64; +}; + +static __forceinline __m128 vec_cts(__m128 x, int a) +{ + assert(a == 0); // Only 2^0 supported + (void)a; + SSE64 sse64; + sse64.m64.m01 = _mm_cvttps_pi32(x); + sse64.m64.m23 = _mm_cvttps_pi32(_mm_ror_ps(x,2)); + _mm_empty(); + return sse64.m128; +} + +static __forceinline __m128 vec_ctf(__m128 x, int a) +{ + assert(a == 0); // Only 2^0 supported + (void)a; + SSE64 sse64; + sse64.m128 = x; + __m128 result =_mm_movelh_ps( + _mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m01), + _mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m23)); + _mm_empty(); + return result; +} +*/ +static __forceinline __m128 vec_cts(__m128 x, int a) +{ + assert(a == 0); // Only 2^0 supported + (void)a; + __m128i result = _mm_cvtps_epi32(x); + return (__m128 &)result; +} + +static __forceinline __m128 vec_ctf(__m128 x, int a) +{ + assert(a == 0); // Only 2^0 supported + (void)a; + return _mm_cvtepi32_ps((__m128i &)x); +} + +#define vec_nmsub(a,b,c) _mm_sub_ps( c, _mm_mul_ps( a, b ) ) +#define vec_sub(a,b) _mm_sub_ps( a, b ) +#define vec_add(a,b) _mm_add_ps( a, b ) +#define vec_mul(a,b) _mm_mul_ps( a, b ) +#define vec_xor(a,b) _mm_xor_ps( a, b ) +#define vec_and(a,b) _mm_and_ps( a, b ) +#define vec_cmpeq(a,b) _mm_cmpeq_ps( a, b ) +#define vec_cmpgt(a,b) _mm_cmpgt_ps( a, b ) + +#define vec_mergeh(a,b) _mm_unpacklo_ps( a, b ) +#define vec_mergel(a,b) _mm_unpackhi_ps( a, b ) + +#define vec_andc(a,b) _mm_andnot_ps( b, a ) + +#define sqrtf4(x) _mm_sqrt_ps( x ) +#define rsqrtf4(x) _mm_rsqrt_ps( x ) +#define recipf4(x) _mm_rcp_ps( x ) +#define negatef4(x) _mm_sub_ps( _mm_setzero_ps(), x ) + +static __forceinline __m128 newtonrapson_rsqrt4( const __m128 v ) +{ +#define _half4 _mm_setr_ps(.5f,.5f,.5f,.5f) +#define _three _mm_setr_ps(3.f,3.f,3.f,3.f) +const __m128 approx = _mm_rsqrt_ps( v ); +const __m128 muls = _mm_mul_ps(_mm_mul_ps(v, approx), approx); +return _mm_mul_ps(_mm_mul_ps(_half4, approx), _mm_sub_ps(_three, muls) ); +} + +static __forceinline __m128 acosf4(__m128 x) +{ + __m128 xabs = fabsf4(x); + __m128 select = _mm_cmplt_ps( x, _mm_setzero_ps() ); + __m128 t1 = sqrtf4(vec_sub(_mm_set1_ps(1.0f), xabs)); + + /* Instruction counts can be reduced if the polynomial was + * computed entirely from nested (dependent) fma's. However, + * to reduce the number of pipeline stalls, the polygon is evaluated + * in two halves (hi amd lo). + */ + __m128 xabs2 = _mm_mul_ps(xabs, xabs); + __m128 xabs4 = _mm_mul_ps(xabs2, xabs2); + __m128 hi = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0012624911f), + xabs, _mm_set1_ps(0.0066700901f)), + xabs, _mm_set1_ps(-0.0170881256f)), + xabs, _mm_set1_ps( 0.0308918810f)); + __m128 lo = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0501743046f), + xabs, _mm_set1_ps(0.0889789874f)), + xabs, _mm_set1_ps(-0.2145988016f)), + xabs, _mm_set1_ps( 1.5707963050f)); + + __m128 result = vec_madd(hi, xabs4, lo); + + // Adjust the result if x is negactive. + return vec_sel( + vec_mul(t1, result), // Positive + vec_nmsub(t1, result, _mm_set1_ps(3.1415926535898f)), // Negative + select); +} + +static __forceinline __m128 sinf4(vec_float4 x) +{ + +// +// Common constants used to evaluate sinf4/cosf4/tanf4 +// +#define _SINCOS_CC0 -0.0013602249f +#define _SINCOS_CC1 0.0416566950f +#define _SINCOS_CC2 -0.4999990225f +#define _SINCOS_SC0 -0.0001950727f +#define _SINCOS_SC1 0.0083320758f +#define _SINCOS_SC2 -0.1666665247f + +#define _SINCOS_KC1 1.57079625129f +#define _SINCOS_KC2 7.54978995489e-8f + + vec_float4 xl,xl2,xl3,res; + + // Range reduction using : xl = angle * TwoOverPi; + // + xl = vec_mul(x, _mm_set1_ps(0.63661977236f)); + + // Find the quadrant the angle falls in + // using: q = (int) (ceil(abs(xl))*sign(xl)) + // + vec_int4 q = vec_cts(xl,0); + + // Compute an offset based on the quadrant that the angle falls in + // + vec_int4 offset = _mm_and_ps(q,toM128(0x3)); + + // Remainder in range [-pi/4..pi/4] + // + vec_float4 qf = vec_ctf(q,0); + xl = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x)); + + // Compute x^2 and x^3 + // + xl2 = vec_mul(xl,xl); + xl3 = vec_mul(xl2,xl); + + // Compute both the sin and cos of the angles + // using a polynomial expression: + // cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and + // sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2) + // + + vec_float4 cx = + vec_madd( + vec_madd( + vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f)); + vec_float4 sx = + vec_madd( + vec_madd( + vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl); + + // Use the cosine when the offset is odd and the sin + // when the offset is even + // + res = vec_sel(cx,sx,vec_cmpeq(vec_and(offset, + toM128(0x1)), + _mm_setzero_ps())); + + // Flip the sign of the result when (offset mod 4) = 1 or 2 + // + return vec_sel( + vec_xor(toM128(0x80000000U), res), // Negative + res, // Positive + vec_cmpeq(vec_and(offset,toM128(0x2)),_mm_setzero_ps())); +} + +static __forceinline void sincosf4(vec_float4 x, vec_float4* s, vec_float4* c) +{ + vec_float4 xl,xl2,xl3; + vec_int4 offsetSin, offsetCos; + + // Range reduction using : xl = angle * TwoOverPi; + // + xl = vec_mul(x, _mm_set1_ps(0.63661977236f)); + + // Find the quadrant the angle falls in + // using: q = (int) (ceil(abs(xl))*sign(xl)) + // + //vec_int4 q = vec_cts(vec_add(xl,vec_sel(_mm_set1_ps(0.5f),xl,(0x80000000))),0); + vec_int4 q = vec_cts(xl,0); + + // Compute the offset based on the quadrant that the angle falls in. + // Add 1 to the offset for the cosine. + // + offsetSin = vec_and(q,toM128((int)0x3)); + __m128i temp = _mm_add_epi32(_mm_set1_epi32(1),(__m128i &)offsetSin); + offsetCos = (__m128 &)temp; + + // Remainder in range [-pi/4..pi/4] + // + vec_float4 qf = vec_ctf(q,0); + xl = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x)); + + // Compute x^2 and x^3 + // + xl2 = vec_mul(xl,xl); + xl3 = vec_mul(xl2,xl); + + // Compute both the sin and cos of the angles + // using a polynomial expression: + // cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and + // sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2) + // + vec_float4 cx = + vec_madd( + vec_madd( + vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f)); + vec_float4 sx = + vec_madd( + vec_madd( + vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl); + + // Use the cosine when the offset is odd and the sin + // when the offset is even + // + vec_uint4 sinMask = (vec_uint4)vec_cmpeq(vec_and(offsetSin,toM128(0x1)),_mm_setzero_ps()); + vec_uint4 cosMask = (vec_uint4)vec_cmpeq(vec_and(offsetCos,toM128(0x1)),_mm_setzero_ps()); + *s = vec_sel(cx,sx,sinMask); + *c = vec_sel(cx,sx,cosMask); + + // Flip the sign of the result when (offset mod 4) = 1 or 2 + // + sinMask = vec_cmpeq(vec_and(offsetSin,toM128(0x2)),_mm_setzero_ps()); + cosMask = vec_cmpeq(vec_and(offsetCos,toM128(0x2)),_mm_setzero_ps()); + + *s = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*s),*s,sinMask); + *c = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*c),*c,cosMask); +} + +#include "vecidx_aos.h" +#include "floatInVec.h" +#include "boolInVec.h" + +#ifdef _VECTORMATH_DEBUG +#include +#endif +namespace Vectormath { + +namespace Aos { + +//----------------------------------------------------------------------------- +// Forward Declarations +// + +class Vector3; +class Vector4; +class Point3; +class Quat; +class Matrix3; +class Matrix4; +class Transform3; + +// A 3-D vector in array-of-structures format +// +class Vector3 +{ + __m128 mVec128; + + __forceinline void set128(vec_float4 vec); + + __forceinline vec_float4& get128Ref(); + +public: + // Default constructor; does no initialization + // + __forceinline Vector3( ) { }; + + // Default copy constructor + // + __forceinline Vector3(const Vector3& vec); + + // Construct a 3-D vector from x, y, and z elements + // + __forceinline Vector3( float x, float y, float z ); + + // Construct a 3-D vector from x, y, and z elements (scalar data contained in vector data type) + // + __forceinline Vector3( const floatInVec &x, const floatInVec &y, const floatInVec &z ); + + // Copy elements from a 3-D point into a 3-D vector + // + explicit __forceinline Vector3( const Point3 &pnt ); + + // Set all elements of a 3-D vector to the same scalar value + // + explicit __forceinline Vector3( float scalar ); + + // Set all elements of a 3-D vector to the same scalar value (scalar data contained in vector data type) + // + explicit __forceinline Vector3( const floatInVec &scalar ); + + // Set vector float data in a 3-D vector + // + explicit __forceinline Vector3( __m128 vf4 ); + + // Get vector float data from a 3-D vector + // + __forceinline __m128 get128( ) const; + + // Assign one 3-D vector to another + // + __forceinline Vector3 & operator =( const Vector3 &vec ); + + // Set the x element of a 3-D vector + // + __forceinline Vector3 & setX( float x ); + + // Set the y element of a 3-D vector + // + __forceinline Vector3 & setY( float y ); + + // Set the z element of a 3-D vector + // + __forceinline Vector3 & setZ( float z ); + + // Set the x element of a 3-D vector (scalar data contained in vector data type) + // + __forceinline Vector3 & setX( const floatInVec &x ); + + // Set the y element of a 3-D vector (scalar data contained in vector data type) + // + __forceinline Vector3 & setY( const floatInVec &y ); + + // Set the z element of a 3-D vector (scalar data contained in vector data type) + // + __forceinline Vector3 & setZ( const floatInVec &z ); + + // Get the x element of a 3-D vector + // + __forceinline const floatInVec getX( ) const; + + // Get the y element of a 3-D vector + // + __forceinline const floatInVec getY( ) const; + + // Get the z element of a 3-D vector + // + __forceinline const floatInVec getZ( ) const; + + // Set an x, y, or z element of a 3-D vector by index + // + __forceinline Vector3 & setElem( int idx, float value ); + + // Set an x, y, or z element of a 3-D vector by index (scalar data contained in vector data type) + // + __forceinline Vector3 & setElem( int idx, const floatInVec &value ); + + // Get an x, y, or z element of a 3-D vector by index + // + __forceinline const floatInVec getElem( int idx ) const; + + // Subscripting operator to set or get an element + // + __forceinline VecIdx operator []( int idx ); + + // Subscripting operator to get an element + // + __forceinline const floatInVec operator []( int idx ) const; + + // Add two 3-D vectors + // + __forceinline const Vector3 operator +( const Vector3 &vec ) const; + + // Subtract a 3-D vector from another 3-D vector + // + __forceinline const Vector3 operator -( const Vector3 &vec ) const; + + // Add a 3-D vector to a 3-D point + // + __forceinline const Point3 operator +( const Point3 &pnt ) const; + + // Multiply a 3-D vector by a scalar + // + __forceinline const Vector3 operator *( float scalar ) const; + + // Divide a 3-D vector by a scalar + // + __forceinline const Vector3 operator /( float scalar ) const; + + // Multiply a 3-D vector by a scalar (scalar data contained in vector data type) + // + __forceinline const Vector3 operator *( const floatInVec &scalar ) const; + + // Divide a 3-D vector by a scalar (scalar data contained in vector data type) + // + __forceinline const Vector3 operator /( const floatInVec &scalar ) const; + + // Perform compound assignment and addition with a 3-D vector + // + __forceinline Vector3 & operator +=( const Vector3 &vec ); + + // Perform compound assignment and subtraction by a 3-D vector + // + __forceinline Vector3 & operator -=( const Vector3 &vec ); + + // Perform compound assignment and multiplication by a scalar + // + __forceinline Vector3 & operator *=( float scalar ); + + // Perform compound assignment and division by a scalar + // + __forceinline Vector3 & operator /=( float scalar ); + + // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type) + // + __forceinline Vector3 & operator *=( const floatInVec &scalar ); + + // Perform compound assignment and division by a scalar (scalar data contained in vector data type) + // + __forceinline Vector3 & operator /=( const floatInVec &scalar ); + + // Negate all elements of a 3-D vector + // + __forceinline const Vector3 operator -( ) const; + + // Construct x axis + // + static __forceinline const Vector3 xAxis( ); + + // Construct y axis + // + static __forceinline const Vector3 yAxis( ); + + // Construct z axis + // + static __forceinline const Vector3 zAxis( ); + +}; + +// Multiply a 3-D vector by a scalar +// +__forceinline const Vector3 operator *( float scalar, const Vector3 &vec ); + +// Multiply a 3-D vector by a scalar (scalar data contained in vector data type) +// +__forceinline const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec ); + +// Multiply two 3-D vectors per element +// +__forceinline const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 ); + +// Divide two 3-D vectors per element +// NOTE: +// Floating-point behavior matches standard library function divf4. +// +__forceinline const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 ); + +// Compute the reciprocal of a 3-D vector per element +// NOTE: +// Floating-point behavior matches standard library function recipf4. +// +__forceinline const Vector3 recipPerElem( const Vector3 &vec ); + +// Compute the absolute value of a 3-D vector per element +// +__forceinline const Vector3 absPerElem( const Vector3 &vec ); + +// Copy sign from one 3-D vector to another, per element +// +__forceinline const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 ); + +// Maximum of two 3-D vectors per element +// +__forceinline const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 ); + +// Minimum of two 3-D vectors per element +// +__forceinline const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 ); + +// Maximum element of a 3-D vector +// +__forceinline const floatInVec maxElem( const Vector3 &vec ); + +// Minimum element of a 3-D vector +// +__forceinline const floatInVec minElem( const Vector3 &vec ); + +// Compute the sum of all elements of a 3-D vector +// +__forceinline const floatInVec sum( const Vector3 &vec ); + +// Compute the dot product of two 3-D vectors +// +__forceinline const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 ); + +// Compute the square of the length of a 3-D vector +// +__forceinline const floatInVec lengthSqr( const Vector3 &vec ); + +// Compute the length of a 3-D vector +// +__forceinline const floatInVec length( const Vector3 &vec ); + +// Normalize a 3-D vector +// NOTE: +// The result is unpredictable when all elements of vec are at or near zero. +// +__forceinline const Vector3 normalize( const Vector3 &vec ); + +// Compute cross product of two 3-D vectors +// +__forceinline const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 ); + +// Outer product of two 3-D vectors +// +__forceinline const Matrix3 outer( const Vector3 &vec0, const Vector3 &vec1 ); + +// Pre-multiply a row vector by a 3x3 matrix +// NOTE: +// Slower than column post-multiply. +// +__forceinline const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat ); + +// Cross-product matrix of a 3-D vector +// +__forceinline const Matrix3 crossMatrix( const Vector3 &vec ); + +// Create cross-product matrix and multiply +// NOTE: +// Faster than separately creating a cross-product matrix and multiplying. +// +__forceinline const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat ); + +// Linear interpolation between two 3-D vectors +// NOTE: +// Does not clamp t between 0 and 1. +// +__forceinline const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 ); + +// Linear interpolation between two 3-D vectors (scalar data contained in vector data type) +// NOTE: +// Does not clamp t between 0 and 1. +// +__forceinline const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 ); + +// Spherical linear interpolation between two 3-D vectors +// NOTE: +// The result is unpredictable if the vectors point in opposite directions. +// Does not clamp t between 0 and 1. +// +__forceinline const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 ); + +// Spherical linear interpolation between two 3-D vectors (scalar data contained in vector data type) +// NOTE: +// The result is unpredictable if the vectors point in opposite directions. +// Does not clamp t between 0 and 1. +// +__forceinline const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 ); + +// Conditionally select between two 3-D vectors +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// However, the transfer of select1 to a VMX register may use more processing time than a branch. +// Use the boolInVec version for better performance. +// +__forceinline const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 ); + +// Conditionally select between two 3-D vectors (scalar data contained in vector data type) +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// +__forceinline const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, const boolInVec &select1 ); + +// Store x, y, and z elements of 3-D vector in first three words of a quadword, preserving fourth word +// +__forceinline void storeXYZ( const Vector3 &vec, __m128 * quad ); + +// Load four three-float 3-D vectors, stored in three quadwords +// +__forceinline void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads ); + +// Store four 3-D vectors in three quadwords +// +__forceinline void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads ); + +// Store eight 3-D vectors as half-floats +// +__forceinline void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads ); + +#ifdef _VECTORMATH_DEBUG + +// Print a 3-D vector +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Vector3 &vec ); + +// Print a 3-D vector and an associated string identifier +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Vector3 &vec, const char * name ); + +#endif + +// A 4-D vector in array-of-structures format +// +class Vector4 +{ + __m128 mVec128; + +public: + // Default constructor; does no initialization + // + __forceinline Vector4( ) { }; + + // Construct a 4-D vector from x, y, z, and w elements + // + __forceinline Vector4( float x, float y, float z, float w ); + + // Construct a 4-D vector from x, y, z, and w elements (scalar data contained in vector data type) + // + __forceinline Vector4( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w ); + + // Construct a 4-D vector from a 3-D vector and a scalar + // + __forceinline Vector4( const Vector3 &xyz, float w ); + + // Construct a 4-D vector from a 3-D vector and a scalar (scalar data contained in vector data type) + // + __forceinline Vector4( const Vector3 &xyz, const floatInVec &w ); + + // Copy x, y, and z from a 3-D vector into a 4-D vector, and set w to 0 + // + explicit __forceinline Vector4( const Vector3 &vec ); + + // Copy x, y, and z from a 3-D point into a 4-D vector, and set w to 1 + // + explicit __forceinline Vector4( const Point3 &pnt ); + + // Copy elements from a quaternion into a 4-D vector + // + explicit __forceinline Vector4( const Quat &quat ); + + // Set all elements of a 4-D vector to the same scalar value + // + explicit __forceinline Vector4( float scalar ); + + // Set all elements of a 4-D vector to the same scalar value (scalar data contained in vector data type) + // + explicit __forceinline Vector4( const floatInVec &scalar ); + + // Set vector float data in a 4-D vector + // + explicit __forceinline Vector4( __m128 vf4 ); + + // Get vector float data from a 4-D vector + // + __forceinline __m128 get128( ) const; + + // Assign one 4-D vector to another + // + __forceinline Vector4 & operator =( const Vector4 &vec ); + + // Set the x, y, and z elements of a 4-D vector + // NOTE: + // This function does not change the w element. + // + __forceinline Vector4 & setXYZ( const Vector3 &vec ); + + // Get the x, y, and z elements of a 4-D vector + // + __forceinline const Vector3 getXYZ( ) const; + + // Set the x element of a 4-D vector + // + __forceinline Vector4 & setX( float x ); + + // Set the y element of a 4-D vector + // + __forceinline Vector4 & setY( float y ); + + // Set the z element of a 4-D vector + // + __forceinline Vector4 & setZ( float z ); + + // Set the w element of a 4-D vector + // + __forceinline Vector4 & setW( float w ); + + // Set the x element of a 4-D vector (scalar data contained in vector data type) + // + __forceinline Vector4 & setX( const floatInVec &x ); + + // Set the y element of a 4-D vector (scalar data contained in vector data type) + // + __forceinline Vector4 & setY( const floatInVec &y ); + + // Set the z element of a 4-D vector (scalar data contained in vector data type) + // + __forceinline Vector4 & setZ( const floatInVec &z ); + + // Set the w element of a 4-D vector (scalar data contained in vector data type) + // + __forceinline Vector4 & setW( const floatInVec &w ); + + // Get the x element of a 4-D vector + // + __forceinline const floatInVec getX( ) const; + + // Get the y element of a 4-D vector + // + __forceinline const floatInVec getY( ) const; + + // Get the z element of a 4-D vector + // + __forceinline const floatInVec getZ( ) const; + + // Get the w element of a 4-D vector + // + __forceinline const floatInVec getW( ) const; + + // Set an x, y, z, or w element of a 4-D vector by index + // + __forceinline Vector4 & setElem( int idx, float value ); + + // Set an x, y, z, or w element of a 4-D vector by index (scalar data contained in vector data type) + // + __forceinline Vector4 & setElem( int idx, const floatInVec &value ); + + // Get an x, y, z, or w element of a 4-D vector by index + // + __forceinline const floatInVec getElem( int idx ) const; + + // Subscripting operator to set or get an element + // + __forceinline VecIdx operator []( int idx ); + + // Subscripting operator to get an element + // + __forceinline const floatInVec operator []( int idx ) const; + + // Add two 4-D vectors + // + __forceinline const Vector4 operator +( const Vector4 &vec ) const; + + // Subtract a 4-D vector from another 4-D vector + // + __forceinline const Vector4 operator -( const Vector4 &vec ) const; + + // Multiply a 4-D vector by a scalar + // + __forceinline const Vector4 operator *( float scalar ) const; + + // Divide a 4-D vector by a scalar + // + __forceinline const Vector4 operator /( float scalar ) const; + + // Multiply a 4-D vector by a scalar (scalar data contained in vector data type) + // + __forceinline const Vector4 operator *( const floatInVec &scalar ) const; + + // Divide a 4-D vector by a scalar (scalar data contained in vector data type) + // + __forceinline const Vector4 operator /( const floatInVec &scalar ) const; + + // Perform compound assignment and addition with a 4-D vector + // + __forceinline Vector4 & operator +=( const Vector4 &vec ); + + // Perform compound assignment and subtraction by a 4-D vector + // + __forceinline Vector4 & operator -=( const Vector4 &vec ); + + // Perform compound assignment and multiplication by a scalar + // + __forceinline Vector4 & operator *=( float scalar ); + + // Perform compound assignment and division by a scalar + // + __forceinline Vector4 & operator /=( float scalar ); + + // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type) + // + __forceinline Vector4 & operator *=( const floatInVec &scalar ); + + // Perform compound assignment and division by a scalar (scalar data contained in vector data type) + // + __forceinline Vector4 & operator /=( const floatInVec &scalar ); + + // Negate all elements of a 4-D vector + // + __forceinline const Vector4 operator -( ) const; + + // Construct x axis + // + static __forceinline const Vector4 xAxis( ); + + // Construct y axis + // + static __forceinline const Vector4 yAxis( ); + + // Construct z axis + // + static __forceinline const Vector4 zAxis( ); + + // Construct w axis + // + static __forceinline const Vector4 wAxis( ); + +}; + +// Multiply a 4-D vector by a scalar +// +__forceinline const Vector4 operator *( float scalar, const Vector4 &vec ); + +// Multiply a 4-D vector by a scalar (scalar data contained in vector data type) +// +__forceinline const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec ); + +// Multiply two 4-D vectors per element +// +__forceinline const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 ); + +// Divide two 4-D vectors per element +// NOTE: +// Floating-point behavior matches standard library function divf4. +// +__forceinline const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 ); + +// Compute the reciprocal of a 4-D vector per element +// NOTE: +// Floating-point behavior matches standard library function recipf4. +// +__forceinline const Vector4 recipPerElem( const Vector4 &vec ); + +// Compute the absolute value of a 4-D vector per element +// +__forceinline const Vector4 absPerElem( const Vector4 &vec ); + +// Copy sign from one 4-D vector to another, per element +// +__forceinline const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 ); + +// Maximum of two 4-D vectors per element +// +__forceinline const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 ); + +// Minimum of two 4-D vectors per element +// +__forceinline const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 ); + +// Maximum element of a 4-D vector +// +__forceinline const floatInVec maxElem( const Vector4 &vec ); + +// Minimum element of a 4-D vector +// +__forceinline const floatInVec minElem( const Vector4 &vec ); + +// Compute the sum of all elements of a 4-D vector +// +__forceinline const floatInVec sum( const Vector4 &vec ); + +// Compute the dot product of two 4-D vectors +// +__forceinline const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 ); + +// Compute the square of the length of a 4-D vector +// +__forceinline const floatInVec lengthSqr( const Vector4 &vec ); + +// Compute the length of a 4-D vector +// +__forceinline const floatInVec length( const Vector4 &vec ); + +// Normalize a 4-D vector +// NOTE: +// The result is unpredictable when all elements of vec are at or near zero. +// +__forceinline const Vector4 normalize( const Vector4 &vec ); + +// Outer product of two 4-D vectors +// +__forceinline const Matrix4 outer( const Vector4 &vec0, const Vector4 &vec1 ); + +// Linear interpolation between two 4-D vectors +// NOTE: +// Does not clamp t between 0 and 1. +// +__forceinline const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 ); + +// Linear interpolation between two 4-D vectors (scalar data contained in vector data type) +// NOTE: +// Does not clamp t between 0 and 1. +// +__forceinline const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 ); + +// Spherical linear interpolation between two 4-D vectors +// NOTE: +// The result is unpredictable if the vectors point in opposite directions. +// Does not clamp t between 0 and 1. +// +__forceinline const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 ); + +// Spherical linear interpolation between two 4-D vectors (scalar data contained in vector data type) +// NOTE: +// The result is unpredictable if the vectors point in opposite directions. +// Does not clamp t between 0 and 1. +// +__forceinline const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 ); + +// Conditionally select between two 4-D vectors +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// However, the transfer of select1 to a VMX register may use more processing time than a branch. +// Use the boolInVec version for better performance. +// +__forceinline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 ); + +// Conditionally select between two 4-D vectors (scalar data contained in vector data type) +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// +__forceinline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, const boolInVec &select1 ); + +// Store four 4-D vectors as half-floats +// +__forceinline void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads ); + +#ifdef _VECTORMATH_DEBUG + +// Print a 4-D vector +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Vector4 &vec ); + +// Print a 4-D vector and an associated string identifier +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Vector4 &vec, const char * name ); + +#endif + +// A 3-D point in array-of-structures format +// +class Point3 +{ + __m128 mVec128; + +public: + // Default constructor; does no initialization + // + __forceinline Point3( ) { }; + + // Construct a 3-D point from x, y, and z elements + // + __forceinline Point3( float x, float y, float z ); + + // Construct a 3-D point from x, y, and z elements (scalar data contained in vector data type) + // + __forceinline Point3( const floatInVec &x, const floatInVec &y, const floatInVec &z ); + + // Copy elements from a 3-D vector into a 3-D point + // + explicit __forceinline Point3( const Vector3 &vec ); + + // Set all elements of a 3-D point to the same scalar value + // + explicit __forceinline Point3( float scalar ); + + // Set all elements of a 3-D point to the same scalar value (scalar data contained in vector data type) + // + explicit __forceinline Point3( const floatInVec &scalar ); + + // Set vector float data in a 3-D point + // + explicit __forceinline Point3( __m128 vf4 ); + + // Get vector float data from a 3-D point + // + __forceinline __m128 get128( ) const; + + // Assign one 3-D point to another + // + __forceinline Point3 & operator =( const Point3 &pnt ); + + // Set the x element of a 3-D point + // + __forceinline Point3 & setX( float x ); + + // Set the y element of a 3-D point + // + __forceinline Point3 & setY( float y ); + + // Set the z element of a 3-D point + // + __forceinline Point3 & setZ( float z ); + + // Set the x element of a 3-D point (scalar data contained in vector data type) + // + __forceinline Point3 & setX( const floatInVec &x ); + + // Set the y element of a 3-D point (scalar data contained in vector data type) + // + __forceinline Point3 & setY( const floatInVec &y ); + + // Set the z element of a 3-D point (scalar data contained in vector data type) + // + __forceinline Point3 & setZ( const floatInVec &z ); + + // Get the x element of a 3-D point + // + __forceinline const floatInVec getX( ) const; + + // Get the y element of a 3-D point + // + __forceinline const floatInVec getY( ) const; + + // Get the z element of a 3-D point + // + __forceinline const floatInVec getZ( ) const; + + // Set an x, y, or z element of a 3-D point by index + // + __forceinline Point3 & setElem( int idx, float value ); + + // Set an x, y, or z element of a 3-D point by index (scalar data contained in vector data type) + // + __forceinline Point3 & setElem( int idx, const floatInVec &value ); + + // Get an x, y, or z element of a 3-D point by index + // + __forceinline const floatInVec getElem( int idx ) const; + + // Subscripting operator to set or get an element + // + __forceinline VecIdx operator []( int idx ); + + // Subscripting operator to get an element + // + __forceinline const floatInVec operator []( int idx ) const; + + // Subtract a 3-D point from another 3-D point + // + __forceinline const Vector3 operator -( const Point3 &pnt ) const; + + // Add a 3-D point to a 3-D vector + // + __forceinline const Point3 operator +( const Vector3 &vec ) const; + + // Subtract a 3-D vector from a 3-D point + // + __forceinline const Point3 operator -( const Vector3 &vec ) const; + + // Perform compound assignment and addition with a 3-D vector + // + __forceinline Point3 & operator +=( const Vector3 &vec ); + + // Perform compound assignment and subtraction by a 3-D vector + // + __forceinline Point3 & operator -=( const Vector3 &vec ); + +}; + +// Multiply two 3-D points per element +// +__forceinline const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 ); + +// Divide two 3-D points per element +// NOTE: +// Floating-point behavior matches standard library function divf4. +// +__forceinline const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 ); + +// Compute the reciprocal of a 3-D point per element +// NOTE: +// Floating-point behavior matches standard library function recipf4. +// +__forceinline const Point3 recipPerElem( const Point3 &pnt ); + +// Compute the absolute value of a 3-D point per element +// +__forceinline const Point3 absPerElem( const Point3 &pnt ); + +// Copy sign from one 3-D point to another, per element +// +__forceinline const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 ); + +// Maximum of two 3-D points per element +// +__forceinline const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 ); + +// Minimum of two 3-D points per element +// +__forceinline const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 ); + +// Maximum element of a 3-D point +// +__forceinline const floatInVec maxElem( const Point3 &pnt ); + +// Minimum element of a 3-D point +// +__forceinline const floatInVec minElem( const Point3 &pnt ); + +// Compute the sum of all elements of a 3-D point +// +__forceinline const floatInVec sum( const Point3 &pnt ); + +// Apply uniform scale to a 3-D point +// +__forceinline const Point3 scale( const Point3 &pnt, float scaleVal ); + +// Apply uniform scale to a 3-D point (scalar data contained in vector data type) +// +__forceinline const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal ); + +// Apply non-uniform scale to a 3-D point +// +__forceinline const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec ); + +// Scalar projection of a 3-D point on a unit-length 3-D vector +// +__forceinline const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec ); + +// Compute the square of the distance of a 3-D point from the coordinate-system origin +// +__forceinline const floatInVec distSqrFromOrigin( const Point3 &pnt ); + +// Compute the distance of a 3-D point from the coordinate-system origin +// +__forceinline const floatInVec distFromOrigin( const Point3 &pnt ); + +// Compute the square of the distance between two 3-D points +// +__forceinline const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 ); + +// Compute the distance between two 3-D points +// +__forceinline const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 ); + +// Linear interpolation between two 3-D points +// NOTE: +// Does not clamp t between 0 and 1. +// +__forceinline const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 ); + +// Linear interpolation between two 3-D points (scalar data contained in vector data type) +// NOTE: +// Does not clamp t between 0 and 1. +// +__forceinline const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 ); + +// Conditionally select between two 3-D points +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// However, the transfer of select1 to a VMX register may use more processing time than a branch. +// Use the boolInVec version for better performance. +// +__forceinline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 ); + +// Conditionally select between two 3-D points (scalar data contained in vector data type) +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// +__forceinline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 ); + +// Store x, y, and z elements of 3-D point in first three words of a quadword, preserving fourth word +// +__forceinline void storeXYZ( const Point3 &pnt, __m128 * quad ); + +// Load four three-float 3-D points, stored in three quadwords +// +__forceinline void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads ); + +// Store four 3-D points in three quadwords +// +__forceinline void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads ); + +// Store eight 3-D points as half-floats +// +__forceinline void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads ); + +#ifdef _VECTORMATH_DEBUG + +// Print a 3-D point +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Point3 &pnt ); + +// Print a 3-D point and an associated string identifier +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Point3 &pnt, const char * name ); + +#endif + +// A quaternion in array-of-structures format +// +class Quat +{ + __m128 mVec128; + +public: + // Default constructor; does no initialization + // + __forceinline Quat( ) { }; + + __forceinline Quat::Quat(const Quat& quat); + + // Construct a quaternion from x, y, z, and w elements + // + __forceinline Quat( float x, float y, float z, float w ); + + // Construct a quaternion from x, y, z, and w elements (scalar data contained in vector data type) + // + __forceinline Quat( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w ); + + // Construct a quaternion from a 3-D vector and a scalar + // + __forceinline Quat( const Vector3 &xyz, float w ); + + // Construct a quaternion from a 3-D vector and a scalar (scalar data contained in vector data type) + // + __forceinline Quat( const Vector3 &xyz, const floatInVec &w ); + + // Copy elements from a 4-D vector into a quaternion + // + explicit __forceinline Quat( const Vector4 &vec ); + + // Convert a rotation matrix to a unit-length quaternion + // + explicit __forceinline Quat( const Matrix3 & rotMat ); + + // Set all elements of a quaternion to the same scalar value + // + explicit __forceinline Quat( float scalar ); + + // Set all elements of a quaternion to the same scalar value (scalar data contained in vector data type) + // + explicit __forceinline Quat( const floatInVec &scalar ); + + // Set vector float data in a quaternion + // + explicit __forceinline Quat( __m128 vf4 ); + + // Get vector float data from a quaternion + // + __forceinline __m128 get128( ) const; + + // Set a quaterion from vector float data + // + __forceinline void set128(vec_float4 vec); + + // Assign one quaternion to another + // + __forceinline Quat & operator =( const Quat &quat ); + + // Set the x, y, and z elements of a quaternion + // NOTE: + // This function does not change the w element. + // + __forceinline Quat & setXYZ( const Vector3 &vec ); + + // Get the x, y, and z elements of a quaternion + // + __forceinline const Vector3 getXYZ( ) const; + + // Set the x element of a quaternion + // + __forceinline Quat & setX( float x ); + + // Set the y element of a quaternion + // + __forceinline Quat & setY( float y ); + + // Set the z element of a quaternion + // + __forceinline Quat & setZ( float z ); + + // Set the w element of a quaternion + // + __forceinline Quat & setW( float w ); + + // Set the x element of a quaternion (scalar data contained in vector data type) + // + __forceinline Quat & setX( const floatInVec &x ); + + // Set the y element of a quaternion (scalar data contained in vector data type) + // + __forceinline Quat & setY( const floatInVec &y ); + + // Set the z element of a quaternion (scalar data contained in vector data type) + // + __forceinline Quat & setZ( const floatInVec &z ); + + // Set the w element of a quaternion (scalar data contained in vector data type) + // + __forceinline Quat & setW( const floatInVec &w ); + + // Get the x element of a quaternion + // + __forceinline const floatInVec getX( ) const; + + // Get the y element of a quaternion + // + __forceinline const floatInVec getY( ) const; + + // Get the z element of a quaternion + // + __forceinline const floatInVec getZ( ) const; + + // Get the w element of a quaternion + // + __forceinline const floatInVec getW( ) const; + + // Set an x, y, z, or w element of a quaternion by index + // + __forceinline Quat & setElem( int idx, float value ); + + // Set an x, y, z, or w element of a quaternion by index (scalar data contained in vector data type) + // + __forceinline Quat & setElem( int idx, const floatInVec &value ); + + // Get an x, y, z, or w element of a quaternion by index + // + __forceinline const floatInVec getElem( int idx ) const; + + // Subscripting operator to set or get an element + // + __forceinline VecIdx operator []( int idx ); + + // Subscripting operator to get an element + // + __forceinline const floatInVec operator []( int idx ) const; + + // Add two quaternions + // + __forceinline const Quat operator +( const Quat &quat ) const; + + // Subtract a quaternion from another quaternion + // + __forceinline const Quat operator -( const Quat &quat ) const; + + // Multiply two quaternions + // + __forceinline const Quat operator *( const Quat &quat ) const; + + // Multiply a quaternion by a scalar + // + __forceinline const Quat operator *( float scalar ) const; + + // Divide a quaternion by a scalar + // + __forceinline const Quat operator /( float scalar ) const; + + // Multiply a quaternion by a scalar (scalar data contained in vector data type) + // + __forceinline const Quat operator *( const floatInVec &scalar ) const; + + // Divide a quaternion by a scalar (scalar data contained in vector data type) + // + __forceinline const Quat operator /( const floatInVec &scalar ) const; + + // Perform compound assignment and addition with a quaternion + // + __forceinline Quat & operator +=( const Quat &quat ); + + // Perform compound assignment and subtraction by a quaternion + // + __forceinline Quat & operator -=( const Quat &quat ); + + // Perform compound assignment and multiplication by a quaternion + // + __forceinline Quat & operator *=( const Quat &quat ); + + // Perform compound assignment and multiplication by a scalar + // + __forceinline Quat & operator *=( float scalar ); + + // Perform compound assignment and division by a scalar + // + __forceinline Quat & operator /=( float scalar ); + + // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type) + // + __forceinline Quat & operator *=( const floatInVec &scalar ); + + // Perform compound assignment and division by a scalar (scalar data contained in vector data type) + // + __forceinline Quat & operator /=( const floatInVec &scalar ); + + // Negate all elements of a quaternion + // + __forceinline const Quat operator -( ) const; + + // Construct an identity quaternion + // + static __forceinline const Quat identity( ); + + // Construct a quaternion to rotate between two unit-length 3-D vectors + // NOTE: + // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions. + // + static __forceinline const Quat rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 ); + + // Construct a quaternion to rotate around a unit-length 3-D vector + // + static __forceinline const Quat rotation( float radians, const Vector3 &unitVec ); + + // Construct a quaternion to rotate around a unit-length 3-D vector (scalar data contained in vector data type) + // + static __forceinline const Quat rotation( const floatInVec &radians, const Vector3 &unitVec ); + + // Construct a quaternion to rotate around the x axis + // + static __forceinline const Quat rotationX( float radians ); + + // Construct a quaternion to rotate around the y axis + // + static __forceinline const Quat rotationY( float radians ); + + // Construct a quaternion to rotate around the z axis + // + static __forceinline const Quat rotationZ( float radians ); + + // Construct a quaternion to rotate around the x axis (scalar data contained in vector data type) + // + static __forceinline const Quat rotationX( const floatInVec &radians ); + + // Construct a quaternion to rotate around the y axis (scalar data contained in vector data type) + // + static __forceinline const Quat rotationY( const floatInVec &radians ); + + // Construct a quaternion to rotate around the z axis (scalar data contained in vector data type) + // + static __forceinline const Quat rotationZ( const floatInVec &radians ); + +}; + +// Multiply a quaternion by a scalar +// +__forceinline const Quat operator *( float scalar, const Quat &quat ); + +// Multiply a quaternion by a scalar (scalar data contained in vector data type) +// +__forceinline const Quat operator *( const floatInVec &scalar, const Quat &quat ); + +// Compute the conjugate of a quaternion +// +__forceinline const Quat conj( const Quat &quat ); + +// Use a unit-length quaternion to rotate a 3-D vector +// +__forceinline const Vector3 rotate( const Quat &unitQuat, const Vector3 &vec ); + +// Compute the dot product of two quaternions +// +__forceinline const floatInVec dot( const Quat &quat0, const Quat &quat1 ); + +// Compute the norm of a quaternion +// +__forceinline const floatInVec norm( const Quat &quat ); + +// Compute the length of a quaternion +// +__forceinline const floatInVec length( const Quat &quat ); + +// Normalize a quaternion +// NOTE: +// The result is unpredictable when all elements of quat are at or near zero. +// +__forceinline const Quat normalize( const Quat &quat ); + +// Linear interpolation between two quaternions +// NOTE: +// Does not clamp t between 0 and 1. +// +__forceinline const Quat lerp( float t, const Quat &quat0, const Quat &quat1 ); + +// Linear interpolation between two quaternions (scalar data contained in vector data type) +// NOTE: +// Does not clamp t between 0 and 1. +// +__forceinline const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 ); + +// Spherical linear interpolation between two quaternions +// NOTE: +// Interpolates along the shortest path between orientations. +// Does not clamp t between 0 and 1. +// +__forceinline const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 ); + +// Spherical linear interpolation between two quaternions (scalar data contained in vector data type) +// NOTE: +// Interpolates along the shortest path between orientations. +// Does not clamp t between 0 and 1. +// +__forceinline const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 ); + +// Spherical quadrangle interpolation +// +__forceinline const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 ); + +// Spherical quadrangle interpolation (scalar data contained in vector data type) +// +__forceinline const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 ); + +// Conditionally select between two quaternions +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// However, the transfer of select1 to a VMX register may use more processing time than a branch. +// Use the boolInVec version for better performance. +// +__forceinline const Quat select( const Quat &quat0, const Quat &quat1, bool select1 ); + +// Conditionally select between two quaternions (scalar data contained in vector data type) +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// +__forceinline const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 ); + +#ifdef _VECTORMATH_DEBUG + +// Print a quaternion +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Quat &quat ); + +// Print a quaternion and an associated string identifier +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Quat &quat, const char * name ); + +#endif + +// A 3x3 matrix in array-of-structures format +// +class Matrix3 +{ + Vector3 mCol0; + Vector3 mCol1; + Vector3 mCol2; + +public: + // Default constructor; does no initialization + // + __forceinline Matrix3( ) { }; + + // Copy a 3x3 matrix + // + __forceinline Matrix3( const Matrix3 & mat ); + + // Construct a 3x3 matrix containing the specified columns + // + __forceinline Matrix3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2 ); + + // Construct a 3x3 rotation matrix from a unit-length quaternion + // + explicit __forceinline Matrix3( const Quat &unitQuat ); + + // Set all elements of a 3x3 matrix to the same scalar value + // + explicit __forceinline Matrix3( float scalar ); + + // Set all elements of a 3x3 matrix to the same scalar value (scalar data contained in vector data type) + // + explicit __forceinline Matrix3( const floatInVec &scalar ); + + // Assign one 3x3 matrix to another + // + __forceinline Matrix3 & operator =( const Matrix3 & mat ); + + // Set column 0 of a 3x3 matrix + // + __forceinline Matrix3 & setCol0( const Vector3 &col0 ); + + // Set column 1 of a 3x3 matrix + // + __forceinline Matrix3 & setCol1( const Vector3 &col1 ); + + // Set column 2 of a 3x3 matrix + // + __forceinline Matrix3 & setCol2( const Vector3 &col2 ); + + // Get column 0 of a 3x3 matrix + // + __forceinline const Vector3 getCol0( ) const; + + // Get column 1 of a 3x3 matrix + // + __forceinline const Vector3 getCol1( ) const; + + // Get column 2 of a 3x3 matrix + // + __forceinline const Vector3 getCol2( ) const; + + // Set the column of a 3x3 matrix referred to by the specified index + // + __forceinline Matrix3 & setCol( int col, const Vector3 &vec ); + + // Set the row of a 3x3 matrix referred to by the specified index + // + __forceinline Matrix3 & setRow( int row, const Vector3 &vec ); + + // Get the column of a 3x3 matrix referred to by the specified index + // + __forceinline const Vector3 getCol( int col ) const; + + // Get the row of a 3x3 matrix referred to by the specified index + // + __forceinline const Vector3 getRow( int row ) const; + + // Subscripting operator to set or get a column + // + __forceinline Vector3 & operator []( int col ); + + // Subscripting operator to get a column + // + __forceinline const Vector3 operator []( int col ) const; + + // Set the element of a 3x3 matrix referred to by column and row indices + // + __forceinline Matrix3 & setElem( int col, int row, float val ); + + // Set the element of a 3x3 matrix referred to by column and row indices (scalar data contained in vector data type) + // + __forceinline Matrix3 & setElem( int col, int row, const floatInVec &val ); + + // Get the element of a 3x3 matrix referred to by column and row indices + // + __forceinline const floatInVec getElem( int col, int row ) const; + + // Add two 3x3 matrices + // + __forceinline const Matrix3 operator +( const Matrix3 & mat ) const; + + // Subtract a 3x3 matrix from another 3x3 matrix + // + __forceinline const Matrix3 operator -( const Matrix3 & mat ) const; + + // Negate all elements of a 3x3 matrix + // + __forceinline const Matrix3 operator -( ) const; + + // Multiply a 3x3 matrix by a scalar + // + __forceinline const Matrix3 operator *( float scalar ) const; + + // Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type) + // + __forceinline const Matrix3 operator *( const floatInVec &scalar ) const; + + // Multiply a 3x3 matrix by a 3-D vector + // + __forceinline const Vector3 operator *( const Vector3 &vec ) const; + + // Multiply two 3x3 matrices + // + __forceinline const Matrix3 operator *( const Matrix3 & mat ) const; + + // Perform compound assignment and addition with a 3x3 matrix + // + __forceinline Matrix3 & operator +=( const Matrix3 & mat ); + + // Perform compound assignment and subtraction by a 3x3 matrix + // + __forceinline Matrix3 & operator -=( const Matrix3 & mat ); + + // Perform compound assignment and multiplication by a scalar + // + __forceinline Matrix3 & operator *=( float scalar ); + + // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type) + // + __forceinline Matrix3 & operator *=( const floatInVec &scalar ); + + // Perform compound assignment and multiplication by a 3x3 matrix + // + __forceinline Matrix3 & operator *=( const Matrix3 & mat ); + + // Construct an identity 3x3 matrix + // + static __forceinline const Matrix3 identity( ); + + // Construct a 3x3 matrix to rotate around the x axis + // + static __forceinline const Matrix3 rotationX( float radians ); + + // Construct a 3x3 matrix to rotate around the y axis + // + static __forceinline const Matrix3 rotationY( float radians ); + + // Construct a 3x3 matrix to rotate around the z axis + // + static __forceinline const Matrix3 rotationZ( float radians ); + + // Construct a 3x3 matrix to rotate around the x axis (scalar data contained in vector data type) + // + static __forceinline const Matrix3 rotationX( const floatInVec &radians ); + + // Construct a 3x3 matrix to rotate around the y axis (scalar data contained in vector data type) + // + static __forceinline const Matrix3 rotationY( const floatInVec &radians ); + + // Construct a 3x3 matrix to rotate around the z axis (scalar data contained in vector data type) + // + static __forceinline const Matrix3 rotationZ( const floatInVec &radians ); + + // Construct a 3x3 matrix to rotate around the x, y, and z axes + // + static __forceinline const Matrix3 rotationZYX( const Vector3 &radiansXYZ ); + + // Construct a 3x3 matrix to rotate around a unit-length 3-D vector + // + static __forceinline const Matrix3 rotation( float radians, const Vector3 &unitVec ); + + // Construct a 3x3 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type) + // + static __forceinline const Matrix3 rotation( const floatInVec &radians, const Vector3 &unitVec ); + + // Construct a rotation matrix from a unit-length quaternion + // + static __forceinline const Matrix3 rotation( const Quat &unitQuat ); + + // Construct a 3x3 matrix to perform scaling + // + static __forceinline const Matrix3 scale( const Vector3 &scaleVec ); + +}; +// Multiply a 3x3 matrix by a scalar +// +__forceinline const Matrix3 operator *( float scalar, const Matrix3 & mat ); + +// Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type) +// +__forceinline const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat ); + +// Append (post-multiply) a scale transformation to a 3x3 matrix +// NOTE: +// Faster than creating and multiplying a scale transformation matrix. +// +__forceinline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec ); + +// Prepend (pre-multiply) a scale transformation to a 3x3 matrix +// NOTE: +// Faster than creating and multiplying a scale transformation matrix. +// +__forceinline const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat ); + +// Multiply two 3x3 matrices per element +// +__forceinline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 ); + +// Compute the absolute value of a 3x3 matrix per element +// +__forceinline const Matrix3 absPerElem( const Matrix3 & mat ); + +// Transpose of a 3x3 matrix +// +__forceinline const Matrix3 transpose( const Matrix3 & mat ); + +// Compute the inverse of a 3x3 matrix +// NOTE: +// Result is unpredictable when the determinant of mat is equal to or near 0. +// +__forceinline const Matrix3 inverse( const Matrix3 & mat ); + +// Determinant of a 3x3 matrix +// +__forceinline const floatInVec determinant( const Matrix3 & mat ); + +// Conditionally select between two 3x3 matrices +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// However, the transfer of select1 to a VMX register may use more processing time than a branch. +// Use the boolInVec version for better performance. +// +__forceinline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 ); + +// Conditionally select between two 3x3 matrices (scalar data contained in vector data type) +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// +__forceinline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 ); + +#ifdef _VECTORMATH_DEBUG + +// Print a 3x3 matrix +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Matrix3 & mat ); + +// Print a 3x3 matrix and an associated string identifier +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Matrix3 & mat, const char * name ); + +#endif + +// A 4x4 matrix in array-of-structures format +// +class Matrix4 +{ + Vector4 mCol0; + Vector4 mCol1; + Vector4 mCol2; + Vector4 mCol3; + +public: + // Default constructor; does no initialization + // + __forceinline Matrix4( ) { }; + + // Copy a 4x4 matrix + // + __forceinline Matrix4( const Matrix4 & mat ); + + // Construct a 4x4 matrix containing the specified columns + // + __forceinline Matrix4( const Vector4 &col0, const Vector4 &col1, const Vector4 &col2, const Vector4 &col3 ); + + // Construct a 4x4 matrix from a 3x4 transformation matrix + // + explicit __forceinline Matrix4( const Transform3 & mat ); + + // Construct a 4x4 matrix from a 3x3 matrix and a 3-D vector + // + __forceinline Matrix4( const Matrix3 & mat, const Vector3 &translateVec ); + + // Construct a 4x4 matrix from a unit-length quaternion and a 3-D vector + // + __forceinline Matrix4( const Quat &unitQuat, const Vector3 &translateVec ); + + // Set all elements of a 4x4 matrix to the same scalar value + // + explicit __forceinline Matrix4( float scalar ); + + // Set all elements of a 4x4 matrix to the same scalar value (scalar data contained in vector data type) + // + explicit __forceinline Matrix4( const floatInVec &scalar ); + + // Assign one 4x4 matrix to another + // + __forceinline Matrix4 & operator =( const Matrix4 & mat ); + + // Set the upper-left 3x3 submatrix + // NOTE: + // This function does not change the bottom row elements. + // + __forceinline Matrix4 & setUpper3x3( const Matrix3 & mat3 ); + + // Get the upper-left 3x3 submatrix of a 4x4 matrix + // + __forceinline const Matrix3 getUpper3x3( ) const; + + // Set translation component + // NOTE: + // This function does not change the bottom row elements. + // + __forceinline Matrix4 & setTranslation( const Vector3 &translateVec ); + + // Get the translation component of a 4x4 matrix + // + __forceinline const Vector3 getTranslation( ) const; + + // Set column 0 of a 4x4 matrix + // + __forceinline Matrix4 & setCol0( const Vector4 &col0 ); + + // Set column 1 of a 4x4 matrix + // + __forceinline Matrix4 & setCol1( const Vector4 &col1 ); + + // Set column 2 of a 4x4 matrix + // + __forceinline Matrix4 & setCol2( const Vector4 &col2 ); + + // Set column 3 of a 4x4 matrix + // + __forceinline Matrix4 & setCol3( const Vector4 &col3 ); + + // Get column 0 of a 4x4 matrix + // + __forceinline const Vector4 getCol0( ) const; + + // Get column 1 of a 4x4 matrix + // + __forceinline const Vector4 getCol1( ) const; + + // Get column 2 of a 4x4 matrix + // + __forceinline const Vector4 getCol2( ) const; + + // Get column 3 of a 4x4 matrix + // + __forceinline const Vector4 getCol3( ) const; + + // Set the column of a 4x4 matrix referred to by the specified index + // + __forceinline Matrix4 & setCol( int col, const Vector4 &vec ); + + // Set the row of a 4x4 matrix referred to by the specified index + // + __forceinline Matrix4 & setRow( int row, const Vector4 &vec ); + + // Get the column of a 4x4 matrix referred to by the specified index + // + __forceinline const Vector4 getCol( int col ) const; + + // Get the row of a 4x4 matrix referred to by the specified index + // + __forceinline const Vector4 getRow( int row ) const; + + // Subscripting operator to set or get a column + // + __forceinline Vector4 & operator []( int col ); + + // Subscripting operator to get a column + // + __forceinline const Vector4 operator []( int col ) const; + + // Set the element of a 4x4 matrix referred to by column and row indices + // + __forceinline Matrix4 & setElem( int col, int row, float val ); + + // Set the element of a 4x4 matrix referred to by column and row indices (scalar data contained in vector data type) + // + __forceinline Matrix4 & setElem( int col, int row, const floatInVec &val ); + + // Get the element of a 4x4 matrix referred to by column and row indices + // + __forceinline const floatInVec getElem( int col, int row ) const; + + // Add two 4x4 matrices + // + __forceinline const Matrix4 operator +( const Matrix4 & mat ) const; + + // Subtract a 4x4 matrix from another 4x4 matrix + // + __forceinline const Matrix4 operator -( const Matrix4 & mat ) const; + + // Negate all elements of a 4x4 matrix + // + __forceinline const Matrix4 operator -( ) const; + + // Multiply a 4x4 matrix by a scalar + // + __forceinline const Matrix4 operator *( float scalar ) const; + + // Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type) + // + __forceinline const Matrix4 operator *( const floatInVec &scalar ) const; + + // Multiply a 4x4 matrix by a 4-D vector + // + __forceinline const Vector4 operator *( const Vector4 &vec ) const; + + // Multiply a 4x4 matrix by a 3-D vector + // + __forceinline const Vector4 operator *( const Vector3 &vec ) const; + + // Multiply a 4x4 matrix by a 3-D point + // + __forceinline const Vector4 operator *( const Point3 &pnt ) const; + + // Multiply two 4x4 matrices + // + __forceinline const Matrix4 operator *( const Matrix4 & mat ) const; + + // Multiply a 4x4 matrix by a 3x4 transformation matrix + // + __forceinline const Matrix4 operator *( const Transform3 & tfrm ) const; + + // Perform compound assignment and addition with a 4x4 matrix + // + __forceinline Matrix4 & operator +=( const Matrix4 & mat ); + + // Perform compound assignment and subtraction by a 4x4 matrix + // + __forceinline Matrix4 & operator -=( const Matrix4 & mat ); + + // Perform compound assignment and multiplication by a scalar + // + __forceinline Matrix4 & operator *=( float scalar ); + + // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type) + // + __forceinline Matrix4 & operator *=( const floatInVec &scalar ); + + // Perform compound assignment and multiplication by a 4x4 matrix + // + __forceinline Matrix4 & operator *=( const Matrix4 & mat ); + + // Perform compound assignment and multiplication by a 3x4 transformation matrix + // + __forceinline Matrix4 & operator *=( const Transform3 & tfrm ); + + // Construct an identity 4x4 matrix + // + static __forceinline const Matrix4 identity( ); + + // Construct a 4x4 matrix to rotate around the x axis + // + static __forceinline const Matrix4 rotationX( float radians ); + + // Construct a 4x4 matrix to rotate around the y axis + // + static __forceinline const Matrix4 rotationY( float radians ); + + // Construct a 4x4 matrix to rotate around the z axis + // + static __forceinline const Matrix4 rotationZ( float radians ); + + // Construct a 4x4 matrix to rotate around the x axis (scalar data contained in vector data type) + // + static __forceinline const Matrix4 rotationX( const floatInVec &radians ); + + // Construct a 4x4 matrix to rotate around the y axis (scalar data contained in vector data type) + // + static __forceinline const Matrix4 rotationY( const floatInVec &radians ); + + // Construct a 4x4 matrix to rotate around the z axis (scalar data contained in vector data type) + // + static __forceinline const Matrix4 rotationZ( const floatInVec &radians ); + + // Construct a 4x4 matrix to rotate around the x, y, and z axes + // + static __forceinline const Matrix4 rotationZYX( const Vector3 &radiansXYZ ); + + // Construct a 4x4 matrix to rotate around a unit-length 3-D vector + // + static __forceinline const Matrix4 rotation( float radians, const Vector3 &unitVec ); + + // Construct a 4x4 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type) + // + static __forceinline const Matrix4 rotation( const floatInVec &radians, const Vector3 &unitVec ); + + // Construct a rotation matrix from a unit-length quaternion + // + static __forceinline const Matrix4 rotation( const Quat &unitQuat ); + + // Construct a 4x4 matrix to perform scaling + // + static __forceinline const Matrix4 scale( const Vector3 &scaleVec ); + + // Construct a 4x4 matrix to perform translation + // + static __forceinline const Matrix4 translation( const Vector3 &translateVec ); + + // Construct viewing matrix based on eye, position looked at, and up direction + // + static __forceinline const Matrix4 lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec ); + + // Construct a perspective projection matrix + // + static __forceinline const Matrix4 perspective( float fovyRadians, float aspect, float zNear, float zFar ); + + // Construct a perspective projection matrix based on frustum + // + static __forceinline const Matrix4 frustum( float left, float right, float bottom, float top, float zNear, float zFar ); + + // Construct an orthographic projection matrix + // + static __forceinline const Matrix4 orthographic( float left, float right, float bottom, float top, float zNear, float zFar ); + +}; +// Multiply a 4x4 matrix by a scalar +// +__forceinline const Matrix4 operator *( float scalar, const Matrix4 & mat ); + +// Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type) +// +__forceinline const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat ); + +// Append (post-multiply) a scale transformation to a 4x4 matrix +// NOTE: +// Faster than creating and multiplying a scale transformation matrix. +// +__forceinline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec ); + +// Prepend (pre-multiply) a scale transformation to a 4x4 matrix +// NOTE: +// Faster than creating and multiplying a scale transformation matrix. +// +__forceinline const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat ); + +// Multiply two 4x4 matrices per element +// +__forceinline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 ); + +// Compute the absolute value of a 4x4 matrix per element +// +__forceinline const Matrix4 absPerElem( const Matrix4 & mat ); + +// Transpose of a 4x4 matrix +// +__forceinline const Matrix4 transpose( const Matrix4 & mat ); + +// Compute the inverse of a 4x4 matrix +// NOTE: +// Result is unpredictable when the determinant of mat is equal to or near 0. +// +__forceinline const Matrix4 inverse( const Matrix4 & mat ); + +// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix +// NOTE: +// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions. The result is unpredictable when the determinant of mat is equal to or near 0. +// +__forceinline const Matrix4 affineInverse( const Matrix4 & mat ); + +// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix with an orthogonal upper-left 3x3 submatrix +// NOTE: +// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions. +// +__forceinline const Matrix4 orthoInverse( const Matrix4 & mat ); + +// Determinant of a 4x4 matrix +// +__forceinline const floatInVec determinant( const Matrix4 & mat ); + +// Conditionally select between two 4x4 matrices +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// However, the transfer of select1 to a VMX register may use more processing time than a branch. +// Use the boolInVec version for better performance. +// +__forceinline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 ); + +// Conditionally select between two 4x4 matrices (scalar data contained in vector data type) +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// +__forceinline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 ); + +#ifdef _VECTORMATH_DEBUG + +// Print a 4x4 matrix +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Matrix4 & mat ); + +// Print a 4x4 matrix and an associated string identifier +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Matrix4 & mat, const char * name ); + +#endif + +// A 3x4 transformation matrix in array-of-structures format +// +class Transform3 +{ + Vector3 mCol0; + Vector3 mCol1; + Vector3 mCol2; + Vector3 mCol3; + +public: + // Default constructor; does no initialization + // + __forceinline Transform3( ) { }; + + // Copy a 3x4 transformation matrix + // + __forceinline Transform3( const Transform3 & tfrm ); + + // Construct a 3x4 transformation matrix containing the specified columns + // + __forceinline Transform3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2, const Vector3 &col3 ); + + // Construct a 3x4 transformation matrix from a 3x3 matrix and a 3-D vector + // + __forceinline Transform3( const Matrix3 & tfrm, const Vector3 &translateVec ); + + // Construct a 3x4 transformation matrix from a unit-length quaternion and a 3-D vector + // + __forceinline Transform3( const Quat &unitQuat, const Vector3 &translateVec ); + + // Set all elements of a 3x4 transformation matrix to the same scalar value + // + explicit __forceinline Transform3( float scalar ); + + // Set all elements of a 3x4 transformation matrix to the same scalar value (scalar data contained in vector data type) + // + explicit __forceinline Transform3( const floatInVec &scalar ); + + // Assign one 3x4 transformation matrix to another + // + __forceinline Transform3 & operator =( const Transform3 & tfrm ); + + // Set the upper-left 3x3 submatrix + // + __forceinline Transform3 & setUpper3x3( const Matrix3 & mat3 ); + + // Get the upper-left 3x3 submatrix of a 3x4 transformation matrix + // + __forceinline const Matrix3 getUpper3x3( ) const; + + // Set translation component + // + __forceinline Transform3 & setTranslation( const Vector3 &translateVec ); + + // Get the translation component of a 3x4 transformation matrix + // + __forceinline const Vector3 getTranslation( ) const; + + // Set column 0 of a 3x4 transformation matrix + // + __forceinline Transform3 & setCol0( const Vector3 &col0 ); + + // Set column 1 of a 3x4 transformation matrix + // + __forceinline Transform3 & setCol1( const Vector3 &col1 ); + + // Set column 2 of a 3x4 transformation matrix + // + __forceinline Transform3 & setCol2( const Vector3 &col2 ); + + // Set column 3 of a 3x4 transformation matrix + // + __forceinline Transform3 & setCol3( const Vector3 &col3 ); + + // Get column 0 of a 3x4 transformation matrix + // + __forceinline const Vector3 getCol0( ) const; + + // Get column 1 of a 3x4 transformation matrix + // + __forceinline const Vector3 getCol1( ) const; + + // Get column 2 of a 3x4 transformation matrix + // + __forceinline const Vector3 getCol2( ) const; + + // Get column 3 of a 3x4 transformation matrix + // + __forceinline const Vector3 getCol3( ) const; + + // Set the column of a 3x4 transformation matrix referred to by the specified index + // + __forceinline Transform3 & setCol( int col, const Vector3 &vec ); + + // Set the row of a 3x4 transformation matrix referred to by the specified index + // + __forceinline Transform3 & setRow( int row, const Vector4 &vec ); + + // Get the column of a 3x4 transformation matrix referred to by the specified index + // + __forceinline const Vector3 getCol( int col ) const; + + // Get the row of a 3x4 transformation matrix referred to by the specified index + // + __forceinline const Vector4 getRow( int row ) const; + + // Subscripting operator to set or get a column + // + __forceinline Vector3 & operator []( int col ); + + // Subscripting operator to get a column + // + __forceinline const Vector3 operator []( int col ) const; + + // Set the element of a 3x4 transformation matrix referred to by column and row indices + // + __forceinline Transform3 & setElem( int col, int row, float val ); + + // Set the element of a 3x4 transformation matrix referred to by column and row indices (scalar data contained in vector data type) + // + __forceinline Transform3 & setElem( int col, int row, const floatInVec &val ); + + // Get the element of a 3x4 transformation matrix referred to by column and row indices + // + __forceinline const floatInVec getElem( int col, int row ) const; + + // Multiply a 3x4 transformation matrix by a 3-D vector + // + __forceinline const Vector3 operator *( const Vector3 &vec ) const; + + // Multiply a 3x4 transformation matrix by a 3-D point + // + __forceinline const Point3 operator *( const Point3 &pnt ) const; + + // Multiply two 3x4 transformation matrices + // + __forceinline const Transform3 operator *( const Transform3 & tfrm ) const; + + // Perform compound assignment and multiplication by a 3x4 transformation matrix + // + __forceinline Transform3 & operator *=( const Transform3 & tfrm ); + + // Construct an identity 3x4 transformation matrix + // + static __forceinline const Transform3 identity( ); + + // Construct a 3x4 transformation matrix to rotate around the x axis + // + static __forceinline const Transform3 rotationX( float radians ); + + // Construct a 3x4 transformation matrix to rotate around the y axis + // + static __forceinline const Transform3 rotationY( float radians ); + + // Construct a 3x4 transformation matrix to rotate around the z axis + // + static __forceinline const Transform3 rotationZ( float radians ); + + // Construct a 3x4 transformation matrix to rotate around the x axis (scalar data contained in vector data type) + // + static __forceinline const Transform3 rotationX( const floatInVec &radians ); + + // Construct a 3x4 transformation matrix to rotate around the y axis (scalar data contained in vector data type) + // + static __forceinline const Transform3 rotationY( const floatInVec &radians ); + + // Construct a 3x4 transformation matrix to rotate around the z axis (scalar data contained in vector data type) + // + static __forceinline const Transform3 rotationZ( const floatInVec &radians ); + + // Construct a 3x4 transformation matrix to rotate around the x, y, and z axes + // + static __forceinline const Transform3 rotationZYX( const Vector3 &radiansXYZ ); + + // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector + // + static __forceinline const Transform3 rotation( float radians, const Vector3 &unitVec ); + + // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type) + // + static __forceinline const Transform3 rotation( const floatInVec &radians, const Vector3 &unitVec ); + + // Construct a rotation matrix from a unit-length quaternion + // + static __forceinline const Transform3 rotation( const Quat &unitQuat ); + + // Construct a 3x4 transformation matrix to perform scaling + // + static __forceinline const Transform3 scale( const Vector3 &scaleVec ); + + // Construct a 3x4 transformation matrix to perform translation + // + static __forceinline const Transform3 translation( const Vector3 &translateVec ); + +}; +// Append (post-multiply) a scale transformation to a 3x4 transformation matrix +// NOTE: +// Faster than creating and multiplying a scale transformation matrix. +// +__forceinline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec ); + +// Prepend (pre-multiply) a scale transformation to a 3x4 transformation matrix +// NOTE: +// Faster than creating and multiplying a scale transformation matrix. +// +__forceinline const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm ); + +// Multiply two 3x4 transformation matrices per element +// +__forceinline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 ); + +// Compute the absolute value of a 3x4 transformation matrix per element +// +__forceinline const Transform3 absPerElem( const Transform3 & tfrm ); + +// Inverse of a 3x4 transformation matrix +// NOTE: +// Result is unpredictable when the determinant of the left 3x3 submatrix is equal to or near 0. +// +__forceinline const Transform3 inverse( const Transform3 & tfrm ); + +// Compute the inverse of a 3x4 transformation matrix, expected to have an orthogonal upper-left 3x3 submatrix +// NOTE: +// This can be used to achieve better performance than a general inverse when the specified 3x4 transformation matrix meets the given restrictions. +// +__forceinline const Transform3 orthoInverse( const Transform3 & tfrm ); + +// Conditionally select between two 3x4 transformation matrices +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// However, the transfer of select1 to a VMX register may use more processing time than a branch. +// Use the boolInVec version for better performance. +// +__forceinline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 ); + +// Conditionally select between two 3x4 transformation matrices (scalar data contained in vector data type) +// NOTE: +// This function uses a conditional select instruction to avoid a branch. +// +__forceinline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 ); + +#ifdef _VECTORMATH_DEBUG + +// Print a 3x4 transformation matrix +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Transform3 & tfrm ); + +// Print a 3x4 transformation matrix and an associated string identifier +// NOTE: +// Function is only defined when _VECTORMATH_DEBUG is defined. +// +__forceinline void print( const Transform3 & tfrm, const char * name ); + +#endif + +} // namespace Aos +} // namespace Vectormath + +#include "vec_aos.h" +#include "quat_aos.h" +#include "mat_aos.h" + +#endif diff --git a/src/BulletMultiThreaded/vectormath2bullet.h b/src/BulletMultiThreaded/vectormath2bullet.h index 5a4944a55..df0f688d4 100644 --- a/src/BulletMultiThreaded/vectormath2bullet.h +++ b/src/BulletMultiThreaded/vectormath2bullet.h @@ -31,12 +31,7 @@ #define AOS_VECTORMATH_BULLET_CONVERT_H -///only use a system-wide vectormath_aos.h on CELLOS_LV2 or if USE_SYSTEM_VECTORMATH -#if defined(__CELLOS_LV2__) || defined (USE_SYSTEM_VECTORMATH) -#include -#else -#include "BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h" -#endif +#include "vectormath_aos.h" #include "LinearMath/btVector3.h" #include "LinearMath/btQuaternion.h" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6c26661a8..d70e6004c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,8 +1,4 @@ -if (CMAKE_SIZEOF_VOID_P MATCHES "8") -SUBDIRS( BulletSoftBody BulletCollision BulletDynamics LinearMath ) -else (CMAKE_SIZEOF_VOID_P MATCHES "8") SUBDIRS( BulletMultiThreaded BulletSoftBody BulletCollision BulletDynamics LinearMath ) -endif (CMAKE_SIZEOF_VOID_P MATCHES "8") IF(INSTALL_LIBS) #INSTALL of other files requires CMake 2.6 diff --git a/src/LinearMath/btMinMax.h b/src/LinearMath/btMinMax.h index 33ea10ce3..80601c1e2 100644 --- a/src/LinearMath/btMinMax.h +++ b/src/LinearMath/btMinMax.h @@ -17,7 +17,7 @@ subject to the following restrictions: #ifndef GEN_MINMAX_H #define GEN_MINMAX_H -#include "btScalar.h" +#include "LinearMath/btScalar.h" template SIMD_FORCE_INLINE const T& btMin(const T& a, const T& b) @@ -32,7 +32,7 @@ SIMD_FORCE_INLINE const T& btMax(const T& a, const T& b) } template -SIMD_FORCE_INLINE const T& GEN_clamped(const T& a, const T& lb, const T& ub) +SIMD_FORCE_INLINE const T& btClamped(const T& a, const T& lb, const T& ub) { return a < lb ? lb : (ub < a ? ub : a); } @@ -56,7 +56,7 @@ SIMD_FORCE_INLINE void btSetMax(T& a, const T& b) } template -SIMD_FORCE_INLINE void GEN_clamp(T& a, const T& lb, const T& ub) +SIMD_FORCE_INLINE void btClamp(T& a, const T& lb, const T& ub) { if (a < lb) { diff --git a/src/LinearMath/btPoolAllocator.h b/src/LinearMath/btPoolAllocator.h index 39d2559c7..e3b4be9b1 100755 --- a/src/LinearMath/btPoolAllocator.h +++ b/src/LinearMath/btPoolAllocator.h @@ -57,6 +57,11 @@ public: return m_freeCount; } + int getUsedCount() const + { + return m_maxElements - m_freeCount; + } + void* allocate(int size) { // release mode fix @@ -96,6 +101,15 @@ public: return m_elemSize; } + unsigned char* getPoolAddress() + { + return m_pool; + } + + const unsigned char* getPoolAddress() const + { + return m_pool; + } }; diff --git a/src/LinearMath/btScalar.h b/src/LinearMath/btScalar.h index 9f6ceb3e0..f5694e0af 100644 --- a/src/LinearMath/btScalar.h +++ b/src/LinearMath/btScalar.h @@ -93,7 +93,7 @@ inline int btGetVersion() #else #if defined (__CELLOS_LV2__) - #define SIMD_FORCE_INLINE inline + #define SIMD_FORCE_INLINE inline __attribute__((always_inline)) #define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16))) #define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64))) #define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128))) @@ -101,7 +101,14 @@ inline int btGetVersion() #include #endif #ifdef BT_DEBUG - #define btAssert assert +#ifdef __SPU__ +#include +#define printf spu_printf + #define btAssert(x) {if(!(x)){printf("Assert "__FILE__ ":%u ("#x")\n", __LINE__);spu_hcmpeq(0,0);}} +#else + #define btAssert assert +#endif + #else #define btAssert(x) #endif diff --git a/src/LinearMath/btVector3.h b/src/LinearMath/btVector3.h index 84446f229..f1f0be45c 100644 --- a/src/LinearMath/btVector3.h +++ b/src/LinearMath/btVector3.h @@ -667,23 +667,32 @@ SIMD_FORCE_INLINE void btUnSwapVector3Endian(btVector3& vector) vector = swappedVec; } -SIMD_FORCE_INLINE void btPlaneSpace1 (const btVector3& n, btVector3& p, btVector3& q) +template +SIMD_FORCE_INLINE void btPlaneSpace1 (const T& n, T& p, T& q) { - if (btFabs(n.z()) > SIMDSQRT12) { + if (btFabs(n[2]) > SIMDSQRT12) { // choose p in y-z plane btScalar a = n[1]*n[1] + n[2]*n[2]; btScalar k = btRecipSqrt (a); - p.setValue(0,-n[2]*k,n[1]*k); + p[0] = 0; + p[1] = -n[2]*k; + p[2] = n[1]*k; // set q = n x p - q.setValue(a*k,-n[0]*p[2],n[0]*p[1]); + q[0] = a*k; + q[1] = -n[0]*p[2]; + q[2] = n[0]*p[1]; } else { // choose p in x-y plane - btScalar a = n.x()*n.x() + n.y()*n.y(); + btScalar a = n[0]*n[0] + n[1]*n[1]; btScalar k = btRecipSqrt (a); - p.setValue(-n.y()*k,n.x()*k,0); + p[0] = -n[1]*k; + p[1] = n[0]*k; + p[2] = 0; // set q = n x p - q.setValue(-n.z()*p.y(),n.z()*p.x(),a*k); + q[0] = -n[2]*p[1]; + q[1] = n[2]*p[0]; + q[2] = a*k; } } diff --git a/src/MiniCL/cl_MiniCL_Defs.h b/src/MiniCL/cl_MiniCL_Defs.h index ffdac1026..285f48b64 100644 --- a/src/MiniCL/cl_MiniCL_Defs.h +++ b/src/MiniCL/cl_MiniCL_Defs.h @@ -22,7 +22,7 @@ subject to the following restrictions: #define __kernel #define __global -#define __local +#define __local static #define get_global_id(a) __guid_arg #define get_local_id(a) ((__guid_arg) % gMiniCLNumOutstandingTasks) #define get_local_size(a) (gMiniCLNumOutstandingTasks)