From fbc17731ec66da9626b52c7fe8830a03faf4049e Mon Sep 17 00:00:00 2001
From: "erwin.coumans" <erwin.coumans@08e121b0-ae19-0410-a57b-3be3395fd4fd>
Date: Thu, 8 Jul 2010 17:02:38 +0000
Subject: [PATCH] Several changes to sync Bullet trunk with PlayStation 3
 spubullet version

Still needs some cross-platform fixes
---
 CMakeLists.txt                                |   13 +-
 Demos/AllBulletDemos/CMakeLists.txt           |    3 +-
 Demos/BasicDemo/BasicDemo.cpp                 |   50 +-
 Demos/BasicDemo/CMakeLists.txt                |    3 +-
 Demos/Benchmarks/BenchmarkDemo.cpp            |    3 +
 Demos/Benchmarks/CMakeLists.txt               |    6 +-
 Demos/Box2dDemo/CMakeLists.txt                |  142 +-
 Demos/CMakeLists.txt                          |    5 -
 .../ConcaveRaycastDemo/ConcaveRaycastDemo.cpp |    3 +-
 Demos/MultiThreadedDemo/CMakeLists.txt        |   80 +-
 Demos/MultiThreadedDemo/MultiThreadedDemo.cpp |   76 +-
 Demos/OpenGL/DemoApplication.cpp              |   11 +-
 Demos/ParticlesOpenCL/NVidia/CMakeLists.txt   |   14 +-
 Demos/ParticlesOpenCL/ParticlesDemo.cpp       |   69 +-
 Demos/ParticlesOpenCL/ParticlesOCL.cl         |   11 +-
 .../btParticlesDemoDynamicsWorld.cpp          |  131 +-
 .../btParticlesDynamicsWorld.h                |    1 +
 Demos/ParticlesOpenCL/shaders.cpp             |   35 +-
 Demos/SharedOpenCL/btOclUtils.cpp             |    1 +
 Demos/ThreadingDemo/main.cpp                  |    4 +-
 Demos/VectorAdd_OpenCL/MiniCL_VectorAdd.cpp   |    2 +-
 Demos/VectorAdd_OpenCL/NVidia/CMakeLists.txt  |   12 +-
 .../NarrowPhaseCollision/btManifoldPoint.h    |   19 +-
 .../btPersistentManifold.h                    |   29 +-
 .../ConstraintSolver/btSolverConstraint.h     |    4 +-
 src/BulletDynamics/Dynamics/btRigidBody.cpp   |    4 +-
 src/BulletMultiThreaded/CMakeLists.txt        |    7 +-
 src/BulletMultiThreaded/HeapManager.h         |  116 +
 src/BulletMultiThreaded/MiniCL.cpp            |   13 +-
 src/BulletMultiThreaded/PlatformDefinitions.h |   28 +-
 src/BulletMultiThreaded/PpuAddressSpace.h     |    5 +-
 .../SequentialThreadSupport.cpp               |   76 +
 .../SequentialThreadSupport.h                 |    6 +-
 src/BulletMultiThreaded/SpuFakeDma.cpp        |    4 +
 .../SpuNarrowPhaseCollisionTask/Box.h         |   32 +-
 .../SpuContactResult.cpp                      |    2 +-
 .../SpuGatheringCollisionTask.cpp             |   60 +-
 .../boxBoxDistance.cpp                        |  345 +--
 .../boxBoxDistance.h                          |   11 +-
 src/BulletMultiThreaded/TrbDynBody.h          |   78 +
 src/BulletMultiThreaded/TrbStateVec.h         |  334 +++
 .../btParallelConstraintSolver.cpp            | 1341 ++++++++-
 .../btParallelConstraintSolver.h              |  230 +-
 .../vectormath/scalar/{cpp => }/boolInVec.h   |    0
 .../vectormath/scalar/{cpp => }/floatInVec.h  |    0
 .../vectormath/scalar/{cpp => }/mat_aos.h     |    0
 .../vectormath/scalar/{cpp => }/quat_aos.h    |    0
 .../vectormath/scalar/{cpp => }/vec_aos.h     |    0
 .../scalar/{cpp => }/vectormath_aos.h         |    0
 .../vectormath/sse/boolInVec.h                |  247 ++
 .../vectormath/sse/floatInVec.h               |  340 +++
 .../vectormath/sse/mat_aos.h                  | 2190 ++++++++++++++
 .../vectormath/sse/quat_aos.h                 |  579 ++++
 .../vectormath/sse/vec_aos.h                  | 1431 ++++++++++
 .../vectormath/sse/vecidx_aos.h               |   80 +
 .../vectormath/sse/vectormath_aos.h           | 2532 +++++++++++++++++
 src/BulletMultiThreaded/vectormath2bullet.h   |    7 +-
 src/CMakeLists.txt                            |    4 -
 src/LinearMath/btMinMax.h                     |    6 +-
 src/LinearMath/btPoolAllocator.h              |   14 +
 src/LinearMath/btScalar.h                     |   11 +-
 src/LinearMath/btVector3.h                    |   23 +-
 src/MiniCL/cl_MiniCL_Defs.h                   |    2 +-
 63 files changed, 10363 insertions(+), 522 deletions(-)
 create mode 100644 src/BulletMultiThreaded/HeapManager.h
 create mode 100644 src/BulletMultiThreaded/TrbDynBody.h
 create mode 100644 src/BulletMultiThreaded/TrbStateVec.h
 rename src/BulletMultiThreaded/vectormath/scalar/{cpp => }/boolInVec.h (100%)
 rename src/BulletMultiThreaded/vectormath/scalar/{cpp => }/floatInVec.h (100%)
 rename src/BulletMultiThreaded/vectormath/scalar/{cpp => }/mat_aos.h (100%)
 rename src/BulletMultiThreaded/vectormath/scalar/{cpp => }/quat_aos.h (100%)
 rename src/BulletMultiThreaded/vectormath/scalar/{cpp => }/vec_aos.h (100%)
 rename src/BulletMultiThreaded/vectormath/scalar/{cpp => }/vectormath_aos.h (100%)
 create mode 100644 src/BulletMultiThreaded/vectormath/sse/boolInVec.h
 create mode 100644 src/BulletMultiThreaded/vectormath/sse/floatInVec.h
 create mode 100644 src/BulletMultiThreaded/vectormath/sse/mat_aos.h
 create mode 100644 src/BulletMultiThreaded/vectormath/sse/quat_aos.h
 create mode 100644 src/BulletMultiThreaded/vectormath/sse/vec_aos.h
 create mode 100644 src/BulletMultiThreaded/vectormath/sse/vecidx_aos.h
 create mode 100644 src/BulletMultiThreaded/vectormath/sse/vectormath_aos.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9cd0b9183..49143982f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
-cmake_minimum_required(VERSION 2.4)
-
+cmake_minimum_required(VERSION 2.4.3)
+set(CMAKE_ALLOW_LOOSE_LOOP_CONSTRUCTS true)
 
 #this line has to appear before 'PROJECT' in order to be able to disable incremental linking
 SET(MSVC_INCREMENTAL_DEFAULT ON)
@@ -12,10 +12,8 @@ IF (NOT CMAKE_BUILD_TYPE)
  SET(CMAKE_BUILD_TYPE "Release")
 ENDIF (NOT CMAKE_BUILD_TYPE) 
 
-SET(CMAKE_CXX_FLAGS_DEBUG  "-O0 -g -DDEBUG")
+SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_DEBUG")
 
-SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_DEBUG=3")
-SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -D_DEBUG=4")
 MESSAGE("CMAKE_CXX_FLAGS_DEBUG="+${CMAKE_CXX_FLAGS_DEBUG})
 OPTION(USE_DOUBLE_PRECISION "Use double precision"	OFF)
 OPTION(USE_GRAPHICAL_BENCHMARK "Use Graphical Benchmark" ON)
@@ -28,6 +26,11 @@ ENDIF()
 OPTION(USE_MSVC_RUNTIME_LIBRARY_DLL "Use MSVC Runtime Library DLL (/MD or /MDd)" ON)
 OPTION(USE_MSVC_INCREMENTAL_LINKING "Use MSVC Incremental Linking" OFF)
 
+IF(WIN32)
+SET (VECTOR_MATH_INCLUDE ${BULLET_PHYSICS_SOURCE_DIR}/src/BulletMultiThreaded/vectormath/sse CACHE PATH "Vector Math library include path.")
+ELSE(WIN32)
+SET (VECTOR_MATH_INCLUDE ${BULLET_PHYSICS_SOURCE_DIR}/src/BulletMultiThreaded/vectormath/scalar CACHE PATH "Vector Math library include path.")
+ENDIF(WIN32)
 
 
 #SET(CMAKE_EXE_LINKER_FLAGS_INIT    "/STACK:10000000 /INCREMENTAL:NO")	
diff --git a/Demos/AllBulletDemos/CMakeLists.txt b/Demos/AllBulletDemos/CMakeLists.txt
index a8bcdbdb1..081c9d661 100644
--- a/Demos/AllBulletDemos/CMakeLists.txt
+++ b/Demos/AllBulletDemos/CMakeLists.txt
@@ -20,10 +20,11 @@ ${BULLET_PHYSICS_SOURCE_DIR}/src
 ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL 
 ${BULLET_PHYSICS_SOURCE_DIR}/Extras/Serialize/BulletFileLoader
 ${BULLET_PHYSICS_SOURCE_DIR}/Extras/Serialize/BulletWorldImporter
+${VECTOR_MATH_INCLUDE}
 )
 
 LINK_LIBRARIES(
-GLUI  GIMPACTUtils ConvexDecomposition OpenGLSupport  BulletWorldImporter BulletSoftBody BulletDynamics BulletCollision BulletFileLoader LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
+GLUI  GIMPACTUtils ConvexDecomposition BulletMultiThreaded OpenGLSupport  BulletWorldImporter BulletSoftBody BulletDynamics BulletCollision BulletFileLoader LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
 )
 
 SET(AllBulletDemos_SRCS
diff --git a/Demos/BasicDemo/BasicDemo.cpp b/Demos/BasicDemo/BasicDemo.cpp
index e945074e1..50566d13c 100644
--- a/Demos/BasicDemo/BasicDemo.cpp
+++ b/Demos/BasicDemo/BasicDemo.cpp
@@ -33,6 +33,11 @@ subject to the following restrictions:
 #include "GlutStuff.h"
 ///btBulletDynamicsCommon.h is the main Bullet include file, contains most common include files.
 #include "btBulletDynamicsCommon.h"
+
+#include "BulletMultiThreaded/Win32ThreadSupport.h"
+#include "BulletMultiThreaded/btParallelConstraintSolver.h"
+#include "BulletCollision/CollisionDispatch/btSimulationIslandManager.h"
+
 #ifdef TEST_SERIALIZATION
 #include "LinearMath/btSerializer.h"
 #endif //TEST_SERIALIZATION
@@ -50,7 +55,7 @@ void BasicDemo::clientMoveAndDisplay()
 	///step the simulation
 	if (m_dynamicsWorld)
 	{
-		m_dynamicsWorld->stepSimulation(ms / 1000000.f);
+		m_dynamicsWorld->stepSimulation(1./60.,0);//ms / 1000000.f,0);
 		//optional but useful: debug drawing
 		m_dynamicsWorld->debugDrawWorld();
 	}
@@ -79,7 +84,35 @@ void BasicDemo::displayCallback(void) {
 	swapBuffers();
 }
 
+btThreadSupportInterface* createSolverThreadSupport(int maxNumThreads)
+{
+//#define SEQUENTIAL
+#ifdef SEQUENTIAL
+	SequentialThreadSupport::SequentialThreadConstructionInfo tci("solverThreads",SolverThreadFunc,SolverlsMemoryFunc);
+	SequentialThreadSupport* threadSupport = new SequentialThreadSupport(tci);
+	threadSupport->startSPU();
+#else
 
+#ifdef _WIN32
+	Win32ThreadSupport::Win32ThreadConstructionInfo threadConstructionInfo("solverThreads",SolverThreadFunc,SolverlsMemoryFunc,maxNumThreads);
+	Win32ThreadSupport* threadSupport = new Win32ThreadSupport(threadConstructionInfo);
+	threadSupport->startSPU();
+#elif defined (USE_PTHREADS)
+	PosixThreadSupport::ThreadConstructionInfo solverConstructionInfo("solver", SolverThreadFunc,
+																	  SolverlsMemoryFunc, maxNumThreads);
+	
+	PosixThreadSupport* threadSupport = new PosixThreadSupport(solverConstructionInfo);
+	
+#else
+	SequentialThreadSupport::SequentialThreadConstructionInfo tci("solverThreads",SolverThreadFunc,SolverlsMemoryFunc);
+	SequentialThreadSupport* threadSupport = new SequentialThreadSupport(tci);
+	threadSupport->startSPU();
+#endif
+	
+#endif
+
+	return threadSupport;
+}
 
 
 
@@ -100,10 +133,21 @@ void	BasicDemo::initPhysics()
 	m_broadphase = new btDbvtBroadphase();
 
 	///the default constraint solver. For parallel processing you can use a different solver (see Extras/BulletMultiThreaded)
-	btSequentialImpulseConstraintSolver* sol = new btSequentialImpulseConstraintSolver;
+	//btSequentialImpulseConstraintSolver* sol = new btSequentialImpulseConstraintSolver;
+
+	//btSequentialImpulseConstraintSolver* sol = new btSequentialImpulseConstraintSolver;
+	btParallelConstraintSolver* sol = new btParallelConstraintSolver(createSolverThreadSupport(4));
+	
+
+
+
 	m_solver = sol;
 
-	m_dynamicsWorld = new btDiscreteDynamicsWorld(m_dispatcher,m_broadphase,m_solver,m_collisionConfiguration);
+	btDiscreteDynamicsWorld* world =  new btDiscreteDynamicsWorld(m_dispatcher,m_broadphase,m_solver,m_collisionConfiguration);
+	
+	m_dynamicsWorld = world;
+	world->getSimulationIslandManager()->setSplitIslands(false);
+	
 
 	m_dynamicsWorld->setGravity(btVector3(0,-10,0));
 
diff --git a/Demos/BasicDemo/CMakeLists.txt b/Demos/BasicDemo/CMakeLists.txt
index 09568037b..d00423382 100644
--- a/Demos/BasicDemo/CMakeLists.txt
+++ b/Demos/BasicDemo/CMakeLists.txt
@@ -13,13 +13,14 @@ SET(GLUT_ROOT ${BULLET_PHYSICS_SOURCE_DIR}/Glut)
 
 INCLUDE_DIRECTORIES(
 ${BULLET_PHYSICS_SOURCE_DIR}/src ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL 
+${VECTOR_MATH_INCLUDE}
 )
 
 
 
 IF (USE_GLUT)
 	LINK_LIBRARIES(
-	OpenGLSupport  BulletDynamics  BulletCollision LinearMath  ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
+	OpenGLSupport  BulletMultiThreaded BulletDynamics  BulletCollision LinearMath  ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
 	)
 
 IF (WIN32)
diff --git a/Demos/Benchmarks/BenchmarkDemo.cpp b/Demos/Benchmarks/BenchmarkDemo.cpp
index a6e313b90..194e38577 100644
--- a/Demos/Benchmarks/BenchmarkDemo.cpp
+++ b/Demos/Benchmarks/BenchmarkDemo.cpp
@@ -29,9 +29,12 @@ subject to the following restrictions:
 #include "Taru.mdl"
 #include "landscape.mdl"
 #include "BulletCollision/BroadphaseCollision/btDbvtBroadphase.h"
+#ifdef USE_PARALLEL_DISPATCHER_BENCHMARK
 #include "BulletMultiThreaded/SpuGatheringCollisionDispatcher.h"
 #include "BulletMultiThreaded/SequentialThreadSupport.h"
 #include "BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
+#endif
+
 #include "BulletCollision/CollisionDispatch/btSimulationIslandManager.h"
 
 
diff --git a/Demos/Benchmarks/CMakeLists.txt b/Demos/Benchmarks/CMakeLists.txt
index ab4277875..bcd771fef 100644
--- a/Demos/Benchmarks/CMakeLists.txt
+++ b/Demos/Benchmarks/CMakeLists.txt
@@ -11,11 +11,12 @@
 IF (USE_GRAPHICAL_BENCHMARK)
 IF (USE_GLUT)
 	INCLUDE_DIRECTORIES(
-	${BULLET_PHYSICS_SOURCE_DIR}/src 
-	${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL 
+		${BULLET_PHYSICS_SOURCE_DIR}/src 
+		${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL 
 	)
 
 	IF (USE_MULTITHREADED_BENCHMARK)
+	INCLUDE_DIRECTORIES( ${VECTOR_MATH_INCLUDE} )
 		LINK_LIBRARIES(	OpenGLSupport BulletMultiThreaded BulletDynamics  BulletCollision LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY})
 	ELSE()
 		LINK_LIBRARIES(	OpenGLSupport BulletDynamics  BulletCollision LinearMath ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY})
@@ -65,6 +66,7 @@ ELSE (USE_GLUT)
 	)
 
 	IF (USE_MULTITHREADED_BENCHMARK)
+		INCLUDE_DIRECTORIES( ${VECTOR_MATH_INCLUDE} )
 		LINK_LIBRARIES(
 		OpenGLSupport BulletMultiThreaded BulletDynamics  BulletCollision LinearMath ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
 		)
diff --git a/Demos/Box2dDemo/CMakeLists.txt b/Demos/Box2dDemo/CMakeLists.txt
index ccde7c810..0b434f652 100644
--- a/Demos/Box2dDemo/CMakeLists.txt
+++ b/Demos/Box2dDemo/CMakeLists.txt
@@ -1,75 +1,75 @@
-# This is basically the overall name of the project in Visual Studio this is the name of the Solution File
-
-
-# For every executable you have with a main method you should have an add_executable line below.
-# For every add executable line you should list every .cpp and .h file you have associated with that executable.  
-
-
-# This is the variable for Windows.  I use this to define the root of my directory structure.
-SET(GLUT_ROOT ${BULLET_PHYSICS_SOURCE_DIR}/Glut)
-
-# You shouldn't have to modify anything below this line 
-########################################################
-
-INCLUDE_DIRECTORIES(
-${BULLET_PHYSICS_SOURCE_DIR}/src ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL 
-)
-
-IF (USE_GLUT)
-	LINK_LIBRARIES(
-		OpenGLSupport  BulletDynamics  BulletCollision LinearMath    ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
-	)
-
-	IF (WIN32)
-		ADD_EXECUTABLE(AppBox2dDemo
-			main.cpp
-			Box2dDemo.cpp
-			Box2dDemo.h
-			${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
-		)
-	ELSE()
-		ADD_EXECUTABLE(AppBox2dDemo
-			main.cpp
-			Box2dDemo.cpp
-			Box2dDemo.h
-		)
-	ENDIF()
-	IF (WIN32)
-		IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
-			IF (CMAKE_CL_64)
-				ADD_CUSTOM_COMMAND(
-					TARGET AppBox2dDemo
-					POST_BUILD
-					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR}
-				)
-			ELSE(CMAKE_CL_64)
-				ADD_CUSTOM_COMMAND(
-					TARGET AppBox2dDemo
-					POST_BUILD
-					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR}
-				)
-			ENDIF(CMAKE_CL_64)
-		ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
-		
-	ENDIF(WIN32)
-ELSE (USE_GLUT)
-	LINK_LIBRARIES(
-		OpenGLSupport  BulletDynamics  BulletCollision LinearMath  ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
-	)
-
-	ADD_EXECUTABLE(AppBox2dDemo
-		WIN32
-		../OpenGL/Win32AppMain.cpp
-		Win32Box2dDemo.cpp
-		Box2dDemo.cpp
-		Box2dDemo.h
-		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
-	)
-ENDIF (USE_GLUT)
-
+# This is basically the overall name of the project in Visual Studio this is the name of the Solution File
+
+
+# For every executable you have with a main method you should have an add_executable line below.
+# For every add executable line you should list every .cpp and .h file you have associated with that executable.  
+
+
+# This is the variable for Windows.  I use this to define the root of my directory structure.
+SET(GLUT_ROOT ${BULLET_PHYSICS_SOURCE_DIR}/Glut)
+
+# You shouldn't have to modify anything below this line 
+########################################################
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL 
+)
+
+IF (USE_GLUT)
+	LINK_LIBRARIES(
+		OpenGLSupport  BulletDynamics  BulletCollision LinearMath    ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
+	)
+
+	IF (WIN32)
+		ADD_EXECUTABLE(AppBox2dDemo
+			main.cpp
+			Box2dDemo.cpp
+			Box2dDemo.h
+			${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+		)
+	ELSE()
+		ADD_EXECUTABLE(AppBox2dDemo
+			main.cpp
+			Box2dDemo.cpp
+			Box2dDemo.h
+		)
+	ENDIF()
+	IF (WIN32)
+		IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+			IF (CMAKE_CL_64)
+				ADD_CUSTOM_COMMAND(
+					TARGET AppBox2dDemo
+					POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR}
+				)
+			ELSE(CMAKE_CL_64)
+				ADD_CUSTOM_COMMAND(
+					TARGET AppBox2dDemo
+					POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR}
+				)
+			ENDIF(CMAKE_CL_64)
+		ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		
+	ENDIF(WIN32)
+ELSE (USE_GLUT)
+	LINK_LIBRARIES(
+		OpenGLSupport  BulletDynamics  BulletCollision LinearMath  ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
+	)
+
+	ADD_EXECUTABLE(AppBox2dDemo
+		WIN32
+		../OpenGL/Win32AppMain.cpp
+		Win32Box2dDemo.cpp
+		Box2dDemo.cpp
+		Box2dDemo.h
+		${BULLET_PHYSICS_SOURCE_DIR}/msvc/bullet.rc
+	)
+ENDIF (USE_GLUT)
+
 IF (INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES)
 			SET_TARGET_PROPERTIES(AppBox2dDemo PROPERTIES  DEBUG_POSTFIX "_Debug")
 			SET_TARGET_PROPERTIES(AppBox2dDemo PROPERTIES  MINSIZEREL_POSTFIX "_MinsizeRel")
 			SET_TARGET_PROPERTIES(AppBox2dDemo PROPERTIES  RELWITHDEBINFO_POSTFIX "_RelWithDebugInfo")
-ENDIF(INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES)
-
+ENDIF(INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES)
+
diff --git a/Demos/CMakeLists.txt b/Demos/CMakeLists.txt
index 450d1f783..979bc8180 100644
--- a/Demos/CMakeLists.txt
+++ b/Demos/CMakeLists.txt
@@ -18,17 +18,12 @@ ELSE()
 	)
 ENDIF()
 
-if (CMAKE_SIZEOF_VOID_P MATCHES "8")
-	SUBDIRS( 	${SharedDemoSubdirs}
-	)
-else (CMAKE_SIZEOF_VOID_P MATCHES "8")
 	SUBDIRS( 		${SharedDemoSubdirs}
 		ThreadingDemo
 		MultiThreadedDemo
 		VectorAdd_OpenCL
 		ParticlesOpenCL
 		)
-endif (CMAKE_SIZEOF_VOID_P MATCHES "8")
 
 ELSE (USE_GLUT)
 
diff --git a/Demos/ConcaveRaycastDemo/ConcaveRaycastDemo.cpp b/Demos/ConcaveRaycastDemo/ConcaveRaycastDemo.cpp
index d1abd1c0c..c272517ba 100644
--- a/Demos/ConcaveRaycastDemo/ConcaveRaycastDemo.cpp
+++ b/Demos/ConcaveRaycastDemo/ConcaveRaycastDemo.cpp
@@ -25,7 +25,7 @@ subject to the following restrictions:
 #ifdef BATCH_RAYCASTER
 #include "BulletMultiThreaded/SpuBatchRaycaster.h"
 static SpuBatchRaycaster* gBatchRaycaster = NULL;
-#endif
+
 
 #ifdef USE_LIBSPE2
 #include "BulletMultiThreaded/SpuLibspe2Support.h"
@@ -35,6 +35,7 @@ static SpuBatchRaycaster* gBatchRaycaster = NULL;
 //other platforms run the parallel code sequentially (until pthread support or other parallel implementation is added)
 #include "BulletMultiThreaded/SequentialThreadSupport.h"
 #endif //USE_LIBSPE2
+#endif //BATCH_RAYCASTER
 
 static btVector3*	gVertices=0;
 static int*	gIndices=0;
diff --git a/Demos/MultiThreadedDemo/CMakeLists.txt b/Demos/MultiThreadedDemo/CMakeLists.txt
index 0dbda097c..063e96ea1 100644
--- a/Demos/MultiThreadedDemo/CMakeLists.txt
+++ b/Demos/MultiThreadedDemo/CMakeLists.txt
@@ -1,33 +1,53 @@
-# This is basically the overall name of the project in Visual Studio this is the name of the Solution File
-
-
-# For every executable you have with a main method you should have an add_executable line below.
-# For every add executable line you should list every .cpp and .h file you have associated with that executable.  
-
-
-
-# You shouldn't have to modify anything below this line 
-########################################################
-
-
-INCLUDE_DIRECTORIES(
-${BULLET_PHYSICS_SOURCE_DIR}/src ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL 
-)
-
-LINK_LIBRARIES(
-OpenGLSupport   BulletMultiThreaded BulletDynamics  BulletCollision LinearMath    ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
-)
-
-ADD_EXECUTABLE(AppMultiThreadedDemo
-	main.cpp
-	MultiThreadedDemo.cpp
-	MultiThreadedDemo.h
-)
-IF (UNIX)
-  TARGET_LINK_LIBRARIES(AppMultiThreadedDemo pthread)
-ENDIF(UNIX)
-
-
+# This is basically the overall name of the project in Visual Studio this is the name of the Solution File
+
+
+# For every executable you have with a main method you should have an add_executable line below.
+# For every add executable line you should list every .cpp and .h file you have associated with that executable.  
+
+
+
+# You shouldn't have to modify anything below this line 
+########################################################
+
+
+INCLUDE_DIRECTORIES(
+${BULLET_PHYSICS_SOURCE_DIR}/src 
+${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL 
+${VECTOR_MATH_INCLUDE}
+)
+
+LINK_LIBRARIES(
+OpenGLSupport   BulletMultiThreaded BulletDynamics  BulletCollision LinearMath    ${GLUT_glut_LIBRARY} ${OPENGL_gl_LIBRARY} ${OPENGL_glu_LIBRARY}
+)
+
+ADD_EXECUTABLE(AppMultiThreadedDemo
+	main.cpp
+	MultiThreadedDemo.cpp
+	MultiThreadedDemo.h
+)
+IF (UNIX)
+  TARGET_LINK_LIBRARIES(AppMultiThreadedDemo pthread)
+ENDIF(UNIX)
+
+IF(WIN32)
+IF (CMAKE_CL_64)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		ADD_CUSTOM_COMMAND(	TARGET AppMultiThreadedDemo		POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR}	)
+		ADD_CUSTOM_COMMAND(	TARGET AppMultiThreadedDemo	POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW64.DLL ${CMAKE_CURRENT_BINARY_DIR})
+	ENDIF()
+ELSE(CMAKE_CL_64)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		ADD_CUSTOM_COMMAND(	TARGET AppMultiThreadedDemo	POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR})
+		ADD_CUSTOM_COMMAND(	TARGET AppMultiThreadedDemo	POST_BUILD
+					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW32.DLL ${CMAKE_CURRENT_BINARY_DIR})
+	ENDIF()
+ENDIF(CMAKE_CL_64)
+ENDIF(WIN32)
+
+
 IF (INTERNAL_ADD_POSTFIX_EXECUTABLE_NAMES)
 			SET_TARGET_PROPERTIES(AppMultiThreadedDemo PROPERTIES  DEBUG_POSTFIX "_Debug")
 			SET_TARGET_PROPERTIES(AppMultiThreadedDemo PROPERTIES  MINSIZEREL_POSTFIX "_MinsizeRel")
diff --git a/Demos/MultiThreadedDemo/MultiThreadedDemo.cpp b/Demos/MultiThreadedDemo/MultiThreadedDemo.cpp
index 4fff696df..5f5c437bc 100644
--- a/Demos/MultiThreadedDemo/MultiThreadedDemo.cpp
+++ b/Demos/MultiThreadedDemo/MultiThreadedDemo.cpp
@@ -13,12 +13,13 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-//#define USE_PARALLEL_SOLVER 1 //experimental parallel solver
+#define USE_PARALLEL_SOLVER 1 //experimental parallel solver
 #define USE_PARALLEL_DISPATCHER 1
 
 #include "btBulletDynamicsCommon.h"
 #include "BulletCollision/CollisionDispatch/btSphereSphereCollisionAlgorithm.h"
 #include "BulletCollision/CollisionDispatch/btSphereTriangleCollisionAlgorithm.h"
+#include "BulletCollision/CollisionDispatch/btSimulationIslandManager.h"
 
 #ifdef USE_PARALLEL_DISPATCHER
 #include "BulletMultiThreaded/SpuGatheringCollisionDispatcher.h"
@@ -37,18 +38,52 @@ subject to the following restrictions:
 
 #else
 //other platforms run the parallel code sequentially (until pthread support or other parallel implementation is added)
-#include "BulletMultiThreaded/SequentialThreadSupport.h"
+
 #include "BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h"
 #endif //USE_LIBSPE2
 
 #ifdef USE_PARALLEL_SOLVER
-#include "BulletMultiThreaded/SpuParallelSolver.h"
-#include "BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.h"
+#include "BulletMultiThreaded/btParallelConstraintSolver.h"
+#include "BulletMultiThreaded/SequentialThreadSupport.h"
+
+
+btThreadSupportInterface* createSolverThreadSupport(int maxNumThreads)
+{
+//#define SEQUENTIAL
+#ifdef SEQUENTIAL
+	SequentialThreadSupport::SequentialThreadConstructionInfo tci("solverThreads",SolverThreadFunc,SolverlsMemoryFunc);
+	SequentialThreadSupport* threadSupport = new SequentialThreadSupport(tci);
+	threadSupport->startSPU();
+#else
+
+#ifdef _WIN32
+	Win32ThreadSupport::Win32ThreadConstructionInfo threadConstructionInfo("solverThreads",SolverThreadFunc,SolverlsMemoryFunc,maxNumThreads);
+	Win32ThreadSupport* threadSupport = new Win32ThreadSupport(threadConstructionInfo);
+	threadSupport->startSPU();
+#elif defined (USE_PTHREADS)
+	PosixThreadSupport::ThreadConstructionInfo solverConstructionInfo("solver", SolverThreadFunc,
+																	  SolverlsMemoryFunc, maxNumThreads);
+	
+	PosixThreadSupport* threadSupport = new PosixThreadSupport(solverConstructionInfo);
+	
+#else
+	SequentialThreadSupport::SequentialThreadConstructionInfo tci("solverThreads",SolverThreadFunc,SolverlsMemoryFunc);
+	SequentialThreadSupport* threadSupport = new SequentialThreadSupport(tci);
+	threadSupport->startSPU();
+#endif
+	
+#endif
+
+	return threadSupport;
+}
+
 #endif //USE_PARALLEL_SOLVER
 
 #endif//USE_PARALLEL_DISPATCHER
 
 
+
+
 #include "LinearMath/btQuickprof.h"
 #include "LinearMath/btIDebugDraw.h"
 
@@ -162,8 +197,6 @@ void MultiThreadedDemo::clientMoveAndDisplay()
 		int numSimSteps = 0;
 		numSimSteps = m_dynamicsWorld->stepSimulation(dt,maxSimSubSteps);
 		
-		//optional but useful: debug drawing
-		m_dynamicsWorld->debugDrawWorld();
 
 #ifdef VERBOSE_TIMESTEPPING_CONSOLEOUTPUT
 		if (!numSimSteps)
@@ -182,6 +215,9 @@ void MultiThreadedDemo::clientMoveAndDisplay()
 #endif //VERBOSE_TIMESTEPPING_CONSOLEOUTPUT
 
 #endif
+	   //optional but useful: debug drawing
+               m_dynamicsWorld->debugDrawWorld();
+
 	}
 	
 #ifdef USE_QUICKPROF 
@@ -321,41 +357,23 @@ m_threadSupportCollision = new Win32ThreadSupport(Win32ThreadSupport::Win32Threa
 
 	
 #ifdef USE_PARALLEL_SOLVER
-
-#ifdef USE_WIN32_THREADING
-	m_threadSupportSolver = new Win32ThreadSupport(Win32ThreadSupport::Win32ThreadConstructionInfo(
-								"solver",
-								processSolverTask,
-								createSolverLocalStoreMemory,
-								maxNumOutstandingTasks));
-#elif defined (USE_PTHREADS)
-	PosixThreadSupport::ThreadConstructionInfo solverConstructionInfo("solver", processSolverTask,
-									createSolverLocalStoreMemory, maxNumOutstandingTasks);
-
-    	m_threadSupportSolver = new PosixThreadSupport(solverConstructionInfo);
-#else
-	//for now use sequential version	
-	SequentialThreadSupport::SequentialThreadConstructionInfo solverCI("solver",processSolverTask,createSolverLocalStoreMemory);
-	m_threadSupportSolver = new SequentialThreadSupport(solverCI);
-
-#endif //USE_WIN32_THREADING
-	m_solver = new btParallelSequentialImpulseSolver(m_threadSupportSolver,maxNumOutstandingTasks);
-
+	m_threadSupportSolver = createSolverThreadSupport(maxNumOutstandingTasks);
+	m_solver = new btParallelConstraintSolver(m_threadSupportSolver);
 #else
 
 	btSequentialImpulseConstraintSolver* solver = new btSequentialImpulseConstraintSolver();
-
 	m_solver = solver;
 	//default solverMode is SOLVER_RANDMIZE_ORDER. Warmstarting seems not to improve convergence, see 
 	//solver->setSolverMode(0);//btSequentialImpulseConstraintSolver::SOLVER_USE_WARMSTARTING | btSequentialImpulseConstraintSolver::SOLVER_RANDMIZE_ORDER);
-	
 #endif //USE_PARALLEL_SOLVER
 
+	
 		btDiscreteDynamicsWorld* world = new btDiscreteDynamicsWorld(m_dispatcher,m_broadphase,m_solver,m_collisionConfiguration);
 		m_dynamicsWorld = world;
 
+		world->getSimulationIslandManager()->setSplitIslands(false);
 		world->getSolverInfo().m_numIterations = 4;
-		world->getSolverInfo().m_solverMode = SOLVER_SIMD+SOLVER_USE_WARMSTARTING;
+		world->getSolverInfo().m_solverMode = SOLVER_SIMD+SOLVER_USE_WARMSTARTING;//+SOLVER_RANDMIZE_ORDER;
 
 		m_dynamicsWorld->getDispatchInfo().m_enableSPU = true;
 		m_dynamicsWorld->setGravity(btVector3(0,-10,0));
diff --git a/Demos/OpenGL/DemoApplication.cpp b/Demos/OpenGL/DemoApplication.cpp
index 121381ca7..b09680d0a 100644
--- a/Demos/OpenGL/DemoApplication.cpp
+++ b/Demos/OpenGL/DemoApplication.cpp
@@ -954,8 +954,8 @@ void	DemoApplication::mouseMotionFunc(int x,int y)
 		{
 			btVector3 hor = getRayTo(0,0)-getRayTo(1,0);
 			btVector3 vert = getRayTo(0,0)-getRayTo(0,1);
-			btScalar multiplierX = btScalar(0.01);
-			btScalar multiplierY = btScalar(0.01);
+			btScalar multiplierX = btScalar(0.001);
+			btScalar multiplierY = btScalar(0.001);
 			if (m_ortho)
 			{
 				multiplierX = 1;
@@ -979,7 +979,7 @@ void	DemoApplication::mouseMotionFunc(int x,int y)
 		} 
 		else if(m_mouseButtons & 4) 
 		{
-			m_cameraDistance -= dy * btScalar(0.2f);
+			m_cameraDistance -= dy * btScalar(0.02f);
 			if (m_cameraDistance<btScalar(0.1))
 				m_cameraDistance = btScalar(0.1);
 
@@ -1324,8 +1324,10 @@ void DemoApplication::renderme()
 	updateCamera();
 
 }
+
 #include "BulletCollision/BroadphaseCollision/btAxisSweep3.h"
 
+
 void	DemoApplication::clientResetScene()
 {
 #ifdef SHOW_NUM_DEEP_PENETRATIONS
@@ -1365,7 +1367,8 @@ void	DemoApplication::clientResetScene()
 					//colObj->setActivationState(WANTS_DEACTIVATION);
 				}
 				//removed cached contact points (this is not necessary if all objects have been removed from the dynamics world)
-				//m_dynamicsWorld->getBroadphase()->getOverlappingPairCache()->cleanProxyFromPairs(colObj->getBroadphaseHandle(),getDynamicsWorld()->getDispatcher());
+				if (m_dynamicsWorld->getBroadphase()->getOverlappingPairCache())
+					m_dynamicsWorld->getBroadphase()->getOverlappingPairCache()->cleanProxyFromPairs(colObj->getBroadphaseHandle(),getDynamicsWorld()->getDispatcher());
 
 				btRigidBody* body = btRigidBody::upcast(colObj);
 				if (body && !body->isStaticObject())
diff --git a/Demos/ParticlesOpenCL/NVidia/CMakeLists.txt b/Demos/ParticlesOpenCL/NVidia/CMakeLists.txt
index 87f473e1d..3b17c49ed 100644
--- a/Demos/ParticlesOpenCL/NVidia/CMakeLists.txt
+++ b/Demos/ParticlesOpenCL/NVidia/CMakeLists.txt
@@ -10,14 +10,14 @@ ${BULLET_PHYSICS_SOURCE_DIR}/Demos/OpenGL
 IF(INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 	INCLUDE_DIRECTORIES( $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/inc	)
 	IF (CMAKE_CL_64)
-		SET(CMAK_NVSDKCOMPUTE_LIBPATH		)
+		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/x64    )
 	ELSE(CMAKE_CL_64)
-		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/x64	)
+		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/Win32	)
 	ENDIF(CMAKE_CL_64)
 ELSE()
 	INCLUDE_DIRECTORIES( $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/inc	)
 	IF (CMAKE_CL_64)
-		SET(CMAK_NVSDKCOMPUTE_LIBPATH		)
+		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/x64 )
 	ELSE(CMAKE_CL_64)
 		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/Win32	)
 	ENDIF(CMAKE_CL_64)
@@ -35,7 +35,6 @@ IF (USE_GLUT)
 		OpenGLSupport 
 		BulletDynamics  
 		BulletCollision 
-		BulletMultiThreaded 
 		LinearMath 
 		${GLUT_glut_LIBRARY} 
 		${OPENGL_gl_LIBRARY} 
@@ -74,8 +73,6 @@ IF (CMAKE_CL_64)
 					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/glut64.dll ${CMAKE_CURRENT_BINARY_DIR}	)
 		ADD_CUSTOM_COMMAND(	TARGET AppParticlesOCL_Nv	POST_BUILD
 					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW64.DLL ${CMAKE_CURRENT_BINARY_DIR})
-		ADD_CUSTOM_COMMAND(	TARGET AppParticlesOCL_Nv	POST_BUILD
-					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesOCL.cl ${CMAKE_CURRENT_BINARY_DIR})
 	ENDIF()
 ELSE(CMAKE_CL_64)
 	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
@@ -83,10 +80,13 @@ ELSE(CMAKE_CL_64)
 					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLUT32.DLL ${CMAKE_CURRENT_BINARY_DIR})
 		ADD_CUSTOM_COMMAND(	TARGET AppParticlesOCL_Nv	POST_BUILD
 					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/GLEW32.DLL ${CMAKE_CURRENT_BINARY_DIR})
+	ENDIF()
+ENDIF(CMAKE_CL_64)
+
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 		ADD_CUSTOM_COMMAND(	TARGET AppParticlesOCL_Nv	POST_BUILD
 					COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${BULLET_PHYSICS_SOURCE_DIR}/Demos/ParticlesOpenCL/ParticlesOCL.cl ${CMAKE_CURRENT_BINARY_DIR})
 	ENDIF()
-ENDIF(CMAKE_CL_64)
 ENDIF(WIN32)
 
 IF (UNIX)
diff --git a/Demos/ParticlesOpenCL/ParticlesDemo.cpp b/Demos/ParticlesOpenCL/ParticlesDemo.cpp
index 6a789a2d3..331c73c22 100644
--- a/Demos/ParticlesOpenCL/ParticlesDemo.cpp
+++ b/Demos/ParticlesOpenCL/ParticlesDemo.cpp
@@ -22,8 +22,9 @@ subject to the following restrictions:
 //#define START_POS_Z btScalar(0.4f)
 #define ARRAY_SIZE_X 32
 #define ARRAY_SIZE_Y 32
-//#define ARRAY_SIZE_Y 5
+//#define ARRAY_SIZE_Y 16
 #define ARRAY_SIZE_Z 16
+//16
 //#define ARRAY_SIZE_Z 1
 //#define DIST btScalar(2.f)
 #define DIST (DEF_PARTICLE_RADIUS * 2.f)
@@ -65,7 +66,7 @@ subject to the following restrictions:
 
 
 
-btScalar gTimeStep = btScalar(1./60.);
+btScalar gTimeStep = 0.5f;//btScalar(1./60.);
 
 #define SCALING btScalar(1.f)
 
@@ -254,6 +255,7 @@ void	ParticlesDemo::initPhysics()
 
 	{
 //		btCollisionShape* colShape = new btSphereShape(btScalar(1.0f));
+/*	
 		btCollisionShape* colShape = new btSphereShape(DEF_PARTICLE_RADIUS);
 		m_collisionShapes.push_back(colShape);
 		btTransform startTransform;
@@ -269,37 +271,53 @@ void	ParticlesDemo::initPhysics()
 		rbInfo.m_startWorldTransform = startTransform;
 		btRigidBody* body = new btRigidBody(rbInfo);
 		m_pWorld->addRigidBody(body);
+		*/
+
 		init_scene_directly();
 	}
 	clientResetScene();
 	m_pWorld->initDeviceData();
 }
 
-static float frand(void) { return  2.0f * (float)rand()/(float)RAND_MAX  - 1.0f; }
+inline float frand(void){
+    return (float)rand() / (float)RAND_MAX;
+}
 
 void ParticlesDemo::init_scene_directly()
 {
-	float start_x = START_POS_X - ARRAY_SIZE_X * DIST * btScalar(0.5f);
-	float start_y = START_POS_Y - ARRAY_SIZE_Y * DIST * btScalar(0.5f);
-	float start_z = START_POS_Z - ARRAY_SIZE_Z * DIST * btScalar(0.5f);
-	int total = ARRAY_SIZE_X * ARRAY_SIZE_Y * ARRAY_SIZE_Z;
-	m_pWorld->m_hPos.resize(total);
-	m_pWorld->m_hVel.resize(total);
-	total = 0;
-	for (int k=0;k<ARRAY_SIZE_Y;k++)
-	{
-		for (int i=0;i<ARRAY_SIZE_X;i++)
-		{
-			for(int j = 0;j<ARRAY_SIZE_Z;j++)
-			{
-				m_pWorld->m_hVel[total] = btVector3(0., 0., 0.);
-				btVector3 jitter = 0.01f * 0.03f * btVector3(frand(), frand(), frand());
-				m_pWorld->m_hPos[total] = btVector3(DIST*i + start_x, DIST*k + start_y, DIST*j + start_z) + jitter;
-				total++;
-			}
-		}
-	}
-	m_pWorld->m_numParticles = total;
+
+
+	srand(1969);
+	float start_x = -1+DEF_PARTICLE_RADIUS;//START_POS_X - ARRAY_SIZE_X * DIST * btScalar(0.5f);
+	float start_y = -1+DEF_PARTICLE_RADIUS;//START_POS_Y - ARRAY_SIZE_Y * DIST * btScalar(0.5f);
+	float start_z = -1+DEF_PARTICLE_RADIUS;//START_POS_Z - ARRAY_SIZE_Z * DIST * btScalar(0.5f);
+	int numParticles = ARRAY_SIZE_X * ARRAY_SIZE_Y * ARRAY_SIZE_Z;
+	m_pWorld->m_hPos.resize(numParticles);
+	m_pWorld->m_hVel.resize(numParticles);
+	
+	btScalar spacing = 2 * DEF_PARTICLE_RADIUS;
+
+	for(int z=0; z<ARRAY_SIZE_Z; z++) 
+    {
+        for(int y=0; y<ARRAY_SIZE_Y; y++) 
+        {
+            for(int x=0; x<ARRAY_SIZE_X; x++) 
+            {
+                int i = (z * ARRAY_SIZE_Y * ARRAY_SIZE_X) + (y * ARRAY_SIZE_X) + x;
+                if (i < numParticles) 
+                {
+					btVector3 jitter = 0.01f * 0.03f * btVector3(frand(), frand(), frand());
+					m_pWorld->m_hVel[i]= btVector3(0,0,0);
+					m_pWorld->m_hPos[i].setX((spacing * x) + DEF_PARTICLE_RADIUS -WORLD_SIZE+jitter.getX());
+					m_pWorld->m_hPos[i].setY((spacing * y) + DEF_PARTICLE_RADIUS -WORLD_SIZE+jitter.getY());
+					m_pWorld->m_hPos[i].setZ((spacing * z) + DEF_PARTICLE_RADIUS -WORLD_SIZE+jitter.getZ());
+                }
+            }
+        }
+    }
+
+	m_pWorld->m_numParticles = numParticles;
+	
 }
 
 
@@ -314,6 +332,7 @@ void ParticlesDemo::clientResetScene()
 	}
 	else
 	{
+	
 		m_pWorld->grabSimulationData();
 	}
 }
@@ -392,7 +411,7 @@ void ParticlesDemo::renderme()
 {
 
     glColor3f(1.0, 1.0, 1.0);
-    glutWireCube(2.0);
+	glutWireCube(2*WORLD_SIZE);
 
 	glPointSize(5.0f);
 	glEnable(GL_POINT_SPRITE_ARB);
diff --git a/Demos/ParticlesOpenCL/ParticlesOCL.cl b/Demos/ParticlesOpenCL/ParticlesOCL.cl
index 16407cedc..2ddf9cd9f 100644
--- a/Demos/ParticlesOpenCL/ParticlesOCL.cl
+++ b/Demos/ParticlesOpenCL/ParticlesOCL.cl
@@ -81,7 +81,7 @@ __kernel void kFindCellStart(	int numParticles,
 								__global float4* pSortedVel GUID_ARG)
 {
     int index = get_global_id(0);
-	__local int sharedHash[513];
+	__local int sharedHash[1025];//maximum workgroup size 1024
 	int2 sortedData;
 	
     if(index < numParticles)
@@ -142,6 +142,8 @@ __kernel void kIntegrateMotion(	int numParticles,
     // collide with world boundaries
     float4 worldMin = *((__global float4*)(pParams + 1));
     float4 worldMax = *((__global float4*)(pParams + 2));
+    
+    
     if(pos.x < (worldMin.x + particleRad))
     {
         pos.x = worldMin.x + particleRad;
@@ -216,7 +218,6 @@ float4 collideTwoParticles(
 }
 
 
-
 __kernel void kCollideParticles(int numParticles,
 								__global float4*		pVel,          //output: new velocity
 								__global const float4* pSortedPos, //input: reordered positions
@@ -264,8 +265,10 @@ __kernel void kCollideParticles(int numParticles,
                     continue;
                 }
                //Iterate over particles in this cell
-                int endI = startI + 8;
-                if(endI >= numParticles) endI = numParticles - 1;
+                int endI = startI + 32;
+                if(endI >= numParticles) 
+					endI = numParticles ;
+					
                 for(int j = startI; j < endI; j++)
                 {
 					uint hashC = pPosHash[j].x;
diff --git a/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp b/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp
index 766e54a9a..2f8ca5e6a 100644
--- a/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp
+++ b/Demos/ParticlesOpenCL/btParticlesDemoDynamicsWorld.cpp
@@ -74,6 +74,7 @@ int	btParticlesDynamicsWorld::stepSimulation( btScalar timeStep, int maxSubSteps
 	}
 	{
 		BT_PROFILE("CollideParticles");
+//		printf("\ncollide particles\n\n");
 		runCollideParticlesKernel();
 	}
 	gStepNum++;
@@ -212,8 +213,8 @@ void btParticlesDynamicsWorld::adjustGrid()
 	m_worldMin -= wsize;
 	m_worldMax += wsize;
 */
-	m_worldMin.setValue(-1.f, -1.f, -1.f);
-	m_worldMax.setValue( 1.f,  1.f,  1.f);
+	m_worldMin.setValue(-WORLD_SIZE, -WORLD_SIZE, -WORLD_SIZE);
+	m_worldMax.setValue( WORLD_SIZE,  WORLD_SIZE,  WORLD_SIZE);
 	wsize = m_worldMax - m_worldMin;
 
 	m_cellSize[0] = m_cellSize[1] = m_cellSize[2] = m_particleRad * btScalar(2.f);
@@ -246,7 +247,12 @@ void btParticlesDynamicsWorld::adjustGrid()
 void btParticlesDynamicsWorld::grabSimulationData()
 {
 //	const btVector3& gravity = getGravity();
-	btVector3 gravity(0., -0.06, 0.);
+	//btVector3 gravity(0., -0.06, 0.);
+	//btVector3 gravity(0., -0.0003f, 0.);
+	btVector3 gravity(0,-0.0003,0);
+
+	
+
 	m_simParams.m_gravity[0] = gravity[0];
 	m_simParams.m_gravity[1] = gravity[1];
 	m_simParams.m_gravity[2] = gravity[2];
@@ -258,7 +264,7 @@ void btParticlesDynamicsWorld::grabSimulationData()
 //	m_simParams.m_spring = 0.5f;
 //	m_simParams.m_shear = 0.1f;
 //	m_simParams.m_attraction = 0.0f;
-	m_simParams.m_collisionDamping = 0.02f;
+	m_simParams.m_collisionDamping = 0.025f;//0.02f;
 	m_simParams.m_spring = 0.5f;
 	m_simParams.m_shear = 0.1f;
 	m_simParams.m_attraction = 0.0f;
@@ -427,6 +433,7 @@ void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv)
 	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dCellStart);
 
 	initKernel(PARTICLES_KERNEL_FIND_CELL_START, "kFindCellStart");
+//	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 0, sizeof(int),	(void*) &m_numParticles);
 	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dPosHash);
 	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 2, sizeof(cl_mem),	(void*) &m_dCellStart);
 	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 3, sizeof(cl_mem),	(void*) &m_dPos);
@@ -560,15 +567,28 @@ static btVector3 cpu_collideTwoParticles(
         float relVelDotNorm = relVel.dot(norm);
 		btVector3 tanVel = relVel - relVelDotNorm * norm; 
         //Spring force (potential)
-        float springFactor = -spring * (collideDist - dist);
-		force = springFactor * norm + damping * relVel + shear * tanVel + attraction * relPos;
+        //float springFactor = -spring * (collideDist - dist);
+		float springFactor = -0.4 * (collideDist - dist);
+		force = springFactor * norm + damping * relVel;// + shear * tanVel + attraction * relPos;
     }
     return force;
 }
 
+struct btPair
+{
+	union
+	{
+		int value;
+		short v0[2];
+	};
+};
 
 void btParticlesDynamicsWorld::runCollideParticlesKernel()
 {
+	btAlignedObjectArray<int>	pairs;
+
+	float particleRad = m_simParams.m_particleRad;
+	float collideDist2 = (particleRad + particleRad)*(particleRad + particleRad);
 	cl_int ciErrNum;
 	if(m_useCpuControls[SIMSTAGE_COLLIDE_PARTICLES]->m_active)
 	{	// CPU version
@@ -585,6 +605,7 @@ void btParticlesDynamicsWorld::runCollideParticlesKernel()
 			ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dCellStart, CL_TRUE, 0, memSize, &(m_hCellStart[0]), 0, NULL, NULL);
 			oclCHECKERROR(ciErrNum, CL_SUCCESS);
 		}
+
 		for(int index = 0; index < m_numParticles; index++)
 		{
 			btVector3 posA = m_hSortedPos[index];
@@ -618,7 +639,10 @@ void btParticlesDynamicsWorld::runCollideParticlesKernel()
 							continue;
 						}
 						//Iterate over particles in this cell
-						int endI = startI + 8;
+						int endI = startI + 32;
+						if(endI > m_numParticles) 
+							endI = m_numParticles;
+
 						for(int j = startI; j < endI; j++)
 						{
 							unsigned int hashC = m_hPosHash[j].x;
@@ -630,6 +654,14 @@ void btParticlesDynamicsWorld::runCollideParticlesKernel()
 							{
 								continue;
 							}
+
+							btPair pair;
+							pair.v0[0] = index;
+							pair.v0[1] = j;
+							pairs.push_back(pair.value);
+
+//							printf("index=%d, j=%d\n",index,j);
+//							printf("(index=%d, j=%d) ",index,j);
 							btVector3 posB = m_hSortedPos[j];
 							btVector3 velB = m_hSortedVel[j];
 							//Collide two spheres
@@ -642,6 +674,56 @@ void btParticlesDynamicsWorld::runCollideParticlesKernel()
 			//Write new velocity back to original unsorted location
 			m_hVel[unsortedIndex] = velA + force;
 		}	
+
+//#define BRUTE_FORCE_CHECK 1
+#ifdef BRUTE_FORCE_CHECK
+		for(int index = 0; index < m_numParticles; index++)
+		{
+			btVector3 posA = m_hSortedPos[index];
+			btVector3 velA = m_hSortedVel[index];
+			btVector3 force = btVector3(0, 0, 0);
+			int unsortedIndex = m_hPosHash[index].y;
+			
+			float collisionDamping = m_simParams.m_collisionDamping;
+			float spring = m_simParams.m_spring;
+			float shear = m_simParams.m_shear;
+			float attraction = m_simParams.m_attraction;
+			for(int j = 0 ; j < m_numParticles; j++)
+			{
+				if (index!=j)
+				{
+					btVector3 posB = m_hSortedPos[j];
+					btVector3 velB = m_hSortedVel[j];
+
+
+					btVector3  relPos = posB - posA; relPos[3] = 0.f;
+					float        dist2 = (relPos[0] * relPos[0] + relPos[1] * relPos[1] + relPos[2] * relPos[2]);
+					
+
+					
+					if(dist2 < collideDist2)
+					{
+										//Collide two spheres
+						//				force += cpu_collideTwoParticles(	posA, posB, velA, velB, particleRad, particleRad, 
+						//													spring, collisionDamping, shear, attraction);
+
+						btPair pair;
+						pair.v0[0] = index;
+						pair.v0[1] = j;
+						if (pairs.findLinearSearch(pair.value)==pairs.size())
+						{
+							printf("not found index=%d, j=%d\n",index,j);
+						} 
+
+										
+					}
+				}
+			}
+			//Write new velocity back to original unsorted location
+			//m_hVel[unsortedIndex] = velA + force;
+		}
+#endif //BRUTE_FORCE_CHECK
+
 		memSize = sizeof(btVector3) * m_numParticles;
 		ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL);
 		oclCHECKERROR(ciErrNum, CL_SUCCESS);
@@ -675,7 +757,11 @@ void btParticlesDynamicsWorld::runIntegrateMotionKernel()
 			pos[3] = 1.0f;
 			vel[3] = 0.0f;
 			// apply gravity
-			btVector3 gravity = *((btVector3*)(m_simParams.m_gravity));
+			btVector3 gravity;
+			gravity[0] = m_simParams.m_gravity[0];
+			gravity[1] = m_simParams.m_gravity[1];
+			gravity[2] = m_simParams.m_gravity[2];
+
 			float particleRad = m_simParams.m_particleRad;
 			float globalDamping = m_simParams.m_globalDamping;
 			float boundaryDamping = m_simParams.m_boundaryDamping;
@@ -684,8 +770,16 @@ void btParticlesDynamicsWorld::runIntegrateMotionKernel()
 			// integrate position
 			pos += vel * m_timeStep;
 			// collide with world boundaries
-			btVector3 worldMin = *((btVector3*)(m_simParams.m_worldMin));
-			btVector3 worldMax = *((btVector3*)(m_simParams.m_worldMax));
+			btVector3 worldMin;
+			worldMin[0] = m_simParams.m_worldMin[0];
+			worldMin[1] = m_simParams.m_worldMin[1];
+			worldMin[2] = m_simParams.m_worldMin[2];
+
+			btVector3 worldMax;
+			worldMax[0] = m_simParams.m_worldMax[0];
+			worldMax[1] = m_simParams.m_worldMax[1];
+			worldMax[2] = m_simParams.m_worldMax[2];
+
 			for(int j = 0; j < 3; j++)
 			{
 				if(pos[j] < (worldMin[j] + particleRad))
@@ -800,7 +894,7 @@ void btParticlesDynamicsWorld::runSortHashKernel()
 			}
 		};
 		btHashPosKey* pHash = (btHashPosKey*)(&m_hPosHash[0]);
-		pHash->quickSort(pHash, 0, m_numParticles - 1);
+		pHash->quickSort(pHash, 0, m_numParticles );
 	//	pHash->bitonicSort(pHash, 0, m_hashSize, true);
 		// write back to GPU
 		ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);
@@ -899,8 +993,19 @@ void btParticlesDynamicsWorld::initKernel(int kernelId, char* pName)
 	size_t wgSize;
 	ciErrNum = clGetKernelWorkGroupInfo(kernel, m_cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
 	oclCHECKERROR(ciErrNum, CL_SUCCESS);
-//	if (wgSize > 64)
-//		wgSize = 64;
+	
+
+	
+
+//	if (wgSize > 256)
+//		wgSize = 256;
+
+	if (wgSize > 512)
+		wgSize = 512;
+
+//	if (wgSize > 1024)
+//		wgSize = 1024;
+
 	m_kernels[kernelId].m_Id = kernelId;
 	m_kernels[kernelId].m_kernel = kernel;
 	m_kernels[kernelId].m_name = pName;
diff --git a/Demos/ParticlesOpenCL/btParticlesDynamicsWorld.h b/Demos/ParticlesOpenCL/btParticlesDynamicsWorld.h
index 8a3be82f7..0076b651e 100644
--- a/Demos/ParticlesOpenCL/btParticlesDynamicsWorld.h
+++ b/Demos/ParticlesOpenCL/btParticlesDynamicsWorld.h
@@ -45,6 +45,7 @@ subject to the following restrictions:
 #define PARTICLES_MAX_PARTICLES (65536)
 #define PARTICLES_MAX_NEIGHBORS (32)
 #define DEF_PARTICLE_RADIUS (0.023f)
+#define WORLD_SIZE 1.9f
 
 enum
 {
diff --git a/Demos/ParticlesOpenCL/shaders.cpp b/Demos/ParticlesOpenCL/shaders.cpp
index eea1f8ade..59f028c97 100644
--- a/Demos/ParticlesOpenCL/shaders.cpp
+++ b/Demos/ParticlesOpenCL/shaders.cpp
@@ -1,5 +1,6 @@
 #define STRINGIFY(A) #A
 
+#ifdef USE_AMD_OPENCL
 // vertex shader
 const char *vertexShader = STRINGIFY(
 uniform float pointRadius;  // point size in world space
@@ -17,16 +18,40 @@ void main()
 //hack around latest AMD graphics cards having troubles with point sprite rendering
 //the problem is still unresolved on the 5870 card, and results in a black screen
 //see also http://forums.amd.com/devforum/messageview.cfm?catid=392&threadid=129431
-#ifdef USE_AMD_OPENCL
     gl_TexCoord = gl_MultiTexCoord0;
-#else
-	gl_TexCoord[0] = gl_MultiTexCoord0;
-#endif
-    gl_Position = gl_ModelViewProjectionMatrix * vec4(gl_Vertex.xyz, 1.0);
+
+	gl_Position = gl_ModelViewProjectionMatrix * vec4(gl_Vertex.xyz, 1.0);
 
     gl_FrontColor = gl_Color;
 }
 );
+#else
+// vertex shader
+const char *vertexShader = STRINGIFY(
+uniform float pointRadius;  // point size in world space
+uniform float pointScale;   // scale to calculate size in pixels
+uniform float densityScale;
+uniform float densityOffset;
+varying vec3 posEye;
+void main()
+{
+    // calculate window-space point size
+    posEye = vec3(gl_ModelViewMatrix * vec4(gl_Vertex.xyz, 1.0));
+    float dist = length(posEye);
+    gl_PointSize = pointRadius * (pointScale / dist);
+//    gl_PointSize = 4.0;
+//hack around latest AMD graphics cards having troubles with point sprite rendering
+//the problem is still unresolved on the 5870 card, and results in a black screen
+//see also http://forums.amd.com/devforum/messageview.cfm?catid=392&threadid=129431
+	gl_TexCoord[0] = gl_MultiTexCoord0;
+
+	gl_Position = gl_ModelViewProjectionMatrix * vec4(gl_Vertex.xyz, 1.0);
+
+    gl_FrontColor = gl_Color;
+}
+);
+
+#endif 
 
 // pixel shader for rendering points as shaded spheres
 const char *spherePixelShader = STRINGIFY(
diff --git a/Demos/SharedOpenCL/btOclUtils.cpp b/Demos/SharedOpenCL/btOclUtils.cpp
index bca361449..7af73b92a 100644
--- a/Demos/SharedOpenCL/btOclUtils.cpp
+++ b/Demos/SharedOpenCL/btOclUtils.cpp
@@ -9,6 +9,7 @@
 
 
 
+
 //////////////////////////////////////////////////////////////////////////////
 //! Gets the id of the nth device from the context
 //!
diff --git a/Demos/ThreadingDemo/main.cpp b/Demos/ThreadingDemo/main.cpp
index e5168a76c..a828dc6cd 100644
--- a/Demos/ThreadingDemo/main.cpp
+++ b/Demos/ThreadingDemo/main.cpp
@@ -23,6 +23,8 @@ subject to the following restrictions:
 void	SampleThreadFunc(void* userPtr,void* lsMemory);
 void*	SamplelsMemoryFunc();
 
+#include <stdio.h>
+
 
 #ifdef __APPLE__
 #include "BulletMultiThreaded/PosixThreadSupport.h"
@@ -148,7 +150,7 @@ int main(int argc,char** argv)
 		int numActiveThreads = numThreads;
 		while (numActiveThreads)
 		{
-			if (threadSupport->isTaskCompleted(&arg0,&arg1,0))
+			if (((Win32ThreadSupport*)threadSupport)->isTaskCompleted(&arg0,&arg1,0))
 			{
 				numActiveThreads--;
 				printf("numActiveThreads = %d\n",numActiveThreads);
diff --git a/Demos/VectorAdd_OpenCL/MiniCL_VectorAdd.cpp b/Demos/VectorAdd_OpenCL/MiniCL_VectorAdd.cpp
index 92db86225..e838c6768 100644
--- a/Demos/VectorAdd_OpenCL/MiniCL_VectorAdd.cpp
+++ b/Demos/VectorAdd_OpenCL/MiniCL_VectorAdd.cpp
@@ -192,7 +192,7 @@ int main(int argc, char **argv)
     };
 
     // Create OpenCL context & context
-    cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_CPU, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
+    cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
 	
     // Query all devices available to the context
     ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
diff --git a/Demos/VectorAdd_OpenCL/NVidia/CMakeLists.txt b/Demos/VectorAdd_OpenCL/NVidia/CMakeLists.txt
index 30db18d2e..05adb0575 100644
--- a/Demos/VectorAdd_OpenCL/NVidia/CMakeLists.txt
+++ b/Demos/VectorAdd_OpenCL/NVidia/CMakeLists.txt
@@ -4,16 +4,16 @@
 IF(INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
 	INCLUDE_DIRECTORIES( $ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/inc	)
 	IF (CMAKE_CL_64)
-		SET(CMAKE_NVSDKCOMPUTE_LIBPATH		)
+		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/x64    )
 	ELSE(CMAKE_CL_64)
-		SET(CMAKE_NVSDKCOMPUTE_LIBPATH		$ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/x64	)
+		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{==NVSDKCOMPUTE_ROOT=}/OpenCL/common/lib/Win32	)
 	ENDIF(CMAKE_CL_64)
 ELSE()
 	INCLUDE_DIRECTORIES( $ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/inc	)
 	IF (CMAKE_CL_64)
-		SET(CMAKE_NVSDKCOMPUTE_LIBPATH		)
+		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/x64 )
 	ELSE(CMAKE_CL_64)
-		SET(CMAKE_NVSDKCOMPUTE_LIBPATH		$ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/Win32	)
+		SET(CMAK_NVSDKCOMPUTE_LIBPATH		$ENV{NVSDKCOMPUTE_ROOT}/OpenCL/common/lib/Win32	)
 	ENDIF(CMAKE_CL_64)
 ENDIF()
 
@@ -22,8 +22,8 @@ ${BULLET_PHYSICS_SOURCE_DIR}/src
 )
 
 LINK_LIBRARIES(
-	BulletMultiThreaded LinearMath 
-	${CMAKE_NVSDKCOMPUTE_LIBPATH}/OpenCL.lib
+	 LinearMath 
+	${CMAK_NVSDKCOMPUTE_LIBPATH}/OpenCL.lib
 )
 
 ADD_EXECUTABLE(AppVectorAdd_NVidia
diff --git a/src/BulletCollision/NarrowPhaseCollision/btManifoldPoint.h b/src/BulletCollision/NarrowPhaseCollision/btManifoldPoint.h
index 5987699f1..53c9727bc 100644
--- a/src/BulletCollision/NarrowPhaseCollision/btManifoldPoint.h
+++ b/src/BulletCollision/NarrowPhaseCollision/btManifoldPoint.h
@@ -19,6 +19,15 @@ subject to the following restrictions:
 #include "LinearMath/btVector3.h"
 #include "LinearMath/btTransformUtil.h"
 
+// Don't change following order of parameters
+ATTRIBUTE_ALIGNED16(struct) PfxConstraintRow {
+	btScalar mNormal[3];
+	btScalar mRhs;
+	btScalar mJacDiagInv;
+	btScalar mLowerLimit;
+	btScalar mUpperLimit;
+	btScalar mAccumImpulse;
+};
 
 
 
@@ -62,8 +71,9 @@ class btManifoldPoint
 					m_contactCFM2(0.f),
 					m_lifeTime(0)
 			{
-				
-					
+				mConstraintRow[0].mAccumImpulse = 0.f;
+				mConstraintRow[1].mAccumImpulse = 0.f;
+				mConstraintRow[2].mAccumImpulse = 0.f;
 			}
 
 			
@@ -101,6 +111,11 @@ class btManifoldPoint
 			btVector3		m_lateralFrictionDir1;
 			btVector3		m_lateralFrictionDir2;
 
+
+
+			PfxConstraintRow mConstraintRow[3];
+
+
 			btScalar getDistance() const
 			{
 				return m_distance1;
diff --git a/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h b/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h
index c075c5480..9206129ed 100644
--- a/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h
+++ b/src/BulletCollision/NarrowPhaseCollision/btPersistentManifold.h
@@ -48,7 +48,10 @@ enum btContactManifoldTypes
 ///reduces the cache to 4 points, when more then 4 points are added, using following rules:
 ///the contact point with deepest penetration is always kept, and it tries to maximuze the area covered by the points
 ///note that some pairs of objects might have more then one contact manifold.
-ATTRIBUTE_ALIGNED16( class) btPersistentManifold : public btTypedObject
+
+
+ATTRIBUTE_ALIGNED128( class) btPersistentManifold : public btTypedObject
+//ATTRIBUTE_ALIGNED16( class) btPersistentManifold : public btTypedObject
 {
 
 	btManifoldPoint m_pointCache[MANIFOLD_CACHE_SIZE];
@@ -57,6 +60,7 @@ ATTRIBUTE_ALIGNED16( class) btPersistentManifold : public btTypedObject
 	/// void* will allow any rigidbody class
 	void* m_body0;
 	void* m_body1;
+
 	int	m_cachedPoints;
 
 	btScalar	m_contactBreakingThreshold;
@@ -72,6 +76,9 @@ public:
 
 	BT_DECLARE_ALIGNED_ALLOCATOR();
 
+	int	m_companionIdA;
+	int	m_companionIdB;
+
 	int m_index1a;
 
 	btPersistentManifold();
@@ -139,6 +146,10 @@ public:
 			m_pointCache[index] = m_pointCache[lastUsedIndex]; 
 			//get rid of duplicated userPersistentData pointer
 			m_pointCache[lastUsedIndex].m_userPersistentData = 0;
+			m_pointCache[lastUsedIndex].mConstraintRow[0].mAccumImpulse = 0.f;
+			m_pointCache[lastUsedIndex].mConstraintRow[1].mAccumImpulse = 0.f;
+			m_pointCache[lastUsedIndex].mConstraintRow[2].mAccumImpulse = 0.f;
+
 			m_pointCache[lastUsedIndex].m_appliedImpulse = 0.f;
 			m_pointCache[lastUsedIndex].m_lateralFrictionInitialized = false;
 			m_pointCache[lastUsedIndex].m_appliedImpulseLateral1 = 0.f;
@@ -156,10 +167,13 @@ public:
 #define MAINTAIN_PERSISTENCY 1
 #ifdef MAINTAIN_PERSISTENCY
 		int	lifeTime = m_pointCache[insertIndex].getLifeTime();
-		btScalar	appliedImpulse = m_pointCache[insertIndex].m_appliedImpulse;
-		btScalar	appliedLateralImpulse1 = m_pointCache[insertIndex].m_appliedImpulseLateral1;
-		btScalar	appliedLateralImpulse2 = m_pointCache[insertIndex].m_appliedImpulseLateral2;
-				
+		btScalar	appliedImpulse = m_pointCache[insertIndex].mConstraintRow[0].mAccumImpulse;
+		btScalar	appliedLateralImpulse1 = m_pointCache[insertIndex].mConstraintRow[1].mAccumImpulse;
+		btScalar	appliedLateralImpulse2 = m_pointCache[insertIndex].mConstraintRow[2].mAccumImpulse;
+		bool isLateralFrictionInitialized = m_pointCache[insertIndex].m_lateralFrictionInitialized;
+		
+		
+			
 		btAssert(lifeTime>=0);
 		void* cache = m_pointCache[insertIndex].m_userPersistentData;
 		
@@ -170,6 +184,11 @@ public:
 		m_pointCache[insertIndex].m_appliedImpulseLateral1 = appliedLateralImpulse1;
 		m_pointCache[insertIndex].m_appliedImpulseLateral2 = appliedLateralImpulse2;
 		
+		m_pointCache[insertIndex].mConstraintRow[0].mAccumImpulse =  appliedImpulse;
+		m_pointCache[insertIndex].mConstraintRow[1].mAccumImpulse = appliedLateralImpulse1;
+		m_pointCache[insertIndex].mConstraintRow[2].mAccumImpulse = appliedLateralImpulse2;
+
+
 		m_pointCache[insertIndex].m_lifeTime = lifeTime;
 #else
 		clearUserCache(m_pointCache[insertIndex]);
diff --git a/src/BulletDynamics/ConstraintSolver/btSolverConstraint.h b/src/BulletDynamics/ConstraintSolver/btSolverConstraint.h
index 929cf6d3e..0ce25e783 100644
--- a/src/BulletDynamics/ConstraintSolver/btSolverConstraint.h
+++ b/src/BulletDynamics/ConstraintSolver/btSolverConstraint.h
@@ -59,12 +59,12 @@ ATTRIBUTE_ALIGNED64 (struct)	btSolverConstraint
 	union
 	{
 		btRigidBody*	m_solverBodyA;
-		btScalar	m_unusedPadding2;
+		btScalar	m_companionIdA;
 	};
 	union
 	{
 		btRigidBody*	m_solverBodyB;
-		btScalar	m_unusedPadding3;
+		btScalar	m_companionIdB;
 	};
 	
 	union
diff --git a/src/BulletDynamics/Dynamics/btRigidBody.cpp b/src/BulletDynamics/Dynamics/btRigidBody.cpp
index 714058d5d..5328ed229 100644
--- a/src/BulletDynamics/Dynamics/btRigidBody.cpp
+++ b/src/BulletDynamics/Dynamics/btRigidBody.cpp
@@ -148,8 +148,8 @@ void btRigidBody::setGravity(const btVector3& acceleration)
 
 void btRigidBody::setDamping(btScalar lin_damping, btScalar ang_damping)
 {
-	m_linearDamping = GEN_clamped(lin_damping, (btScalar)btScalar(0.0), (btScalar)btScalar(1.0));
-	m_angularDamping = GEN_clamped(ang_damping, (btScalar)btScalar(0.0), (btScalar)btScalar(1.0));
+	m_linearDamping = btClamped(lin_damping, (btScalar)btScalar(0.0), (btScalar)btScalar(1.0));
+	m_angularDamping = btClamped(ang_damping, (btScalar)btScalar(0.0), (btScalar)btScalar(1.0));
 }
 
 
diff --git a/src/BulletMultiThreaded/CMakeLists.txt b/src/BulletMultiThreaded/CMakeLists.txt
index 31238b275..6a995d058 100644
--- a/src/BulletMultiThreaded/CMakeLists.txt
+++ b/src/BulletMultiThreaded/CMakeLists.txt
@@ -1,6 +1,6 @@
 INCLUDE_DIRECTORIES(
 	${BULLET_PHYSICS_SOURCE_DIR}/src
-	${BULLET_PHYSICS_SOURCE_DIR}/src/BulletMultiThreaded/vectormath/scalar/cpp
+	${VECTOR_MATH_INCLUDE}
 )
 
 ADD_LIBRARY(BulletMultiThreaded
@@ -37,6 +37,11 @@ ADD_LIBRARY(BulletMultiThreaded
 		SpuNarrowPhaseCollisionTask/Box.h
 		SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
 		SpuNarrowPhaseCollisionTask/boxBoxDistance.h
+#		SPURS_PEGatherScatterTask/SpuPEGatherScatterTask.cpp
+#		SPURS_PEGatherScatterTask/SpuPEGatherScatterTask.h
+#		SpuPEGatherScatterTaskProcess.cpp
+#		SpuPEGatherScatterTaskProcess.h
+		
 		SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
 		SpuNarrowPhaseCollisionTask/SpuContactResult.h
 		SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
diff --git a/src/BulletMultiThreaded/HeapManager.h b/src/BulletMultiThreaded/HeapManager.h
new file mode 100644
index 000000000..7d0f499c4
--- /dev/null
+++ b/src/BulletMultiThreaded/HeapManager.h
@@ -0,0 +1,116 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef __HEAP_MANAGER_H__
+#define __HEAP_MANAGER_H__
+
+#ifdef __SPU__
+	#define HEAP_STACK_SIZE 32
+#else
+	#define HEAP_STACK_SIZE 64
+#endif
+
+#define MIN_ALLOC_SIZE 16
+
+
+class HeapManager
+{
+private:
+	ATTRIBUTE_ALIGNED16(unsigned char *mHeap);
+	ATTRIBUTE_ALIGNED16(unsigned int mHeapBytes);
+	ATTRIBUTE_ALIGNED16(unsigned char *mPoolStack[HEAP_STACK_SIZE]);
+	ATTRIBUTE_ALIGNED16(unsigned int mCurStack);
+	
+public:
+	enum {ALIGN16,ALIGN128};
+
+	HeapManager(unsigned char *buf,int bytes)
+	{
+		mHeap = buf;
+		mHeapBytes = bytes;
+		clear();
+	}
+	
+	~HeapManager()
+	{
+	}
+	
+	int getAllocated()
+	{
+		return (int)(mPoolStack[mCurStack]-mHeap);
+	}
+	
+	int getRest()
+	{
+		return mHeapBytes-getAllocated();
+	}
+
+	void *allocate(size_t bytes,int alignment = ALIGN16)
+	{
+		if(bytes <= 0) bytes = MIN_ALLOC_SIZE;
+		btAssert(mCurStack < (HEAP_STACK_SIZE-1));
+
+		
+#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64__)
+		unsigned long long p = (unsigned long long )mPoolStack[mCurStack];
+		if(alignment == ALIGN128) {
+			p = ((p+127) & 0xffffffffffffff80);
+			bytes = (bytes+127) & 0xffffffffffffff80;
+		}
+		else {
+			bytes = (bytes+15) & 0xfffffffffffffff0;
+		}
+
+		btAssert(bytes <=(mHeapBytes-(p-(unsigned long long )mHeap)) );
+		
+#else
+		unsigned long p = (unsigned long )mPoolStack[mCurStack];
+		if(alignment == ALIGN128) {
+			p = ((p+127) & 0xffffff80);
+			bytes = (bytes+127) & 0xffffff80;
+		}
+		else {
+			bytes = (bytes+15) & 0xfffffff0;
+		}
+		btAssert(bytes <=(mHeapBytes-(p-(unsigned long)mHeap)) );
+#endif
+		unsigned char * bla = (unsigned char *)(p + bytes);
+		mPoolStack[++mCurStack] = bla;
+		return (void*)p;
+	}
+
+	void deallocate(void *p)
+	{
+		(void) p;
+		mCurStack--;
+	}
+	
+	void clear()
+	{
+		mPoolStack[0] = mHeap;
+		mCurStack = 0;
+	}
+
+//	void printStack()
+//	{
+//		for(unsigned int i=0;i<=mCurStack;i++) {
+//			PRINTF("memStack %2d 0x%x\n",i,(uint32_t)mPoolStack[i]);
+//		}
+//	}
+
+};
+
+#endif
diff --git a/src/BulletMultiThreaded/MiniCL.cpp b/src/BulletMultiThreaded/MiniCL.cpp
index 129ee2734..56519013f 100644
--- a/src/BulletMultiThreaded/MiniCL.cpp
+++ b/src/BulletMultiThreaded/MiniCL.cpp
@@ -25,8 +25,9 @@ subject to the following restrictions:
 #include "MiniCLTaskScheduler.h"
 #include "MiniCLTask/MiniCLTask.h"
 #include "LinearMath/btMinMax.h"
+#include <stdio.h>
 
-//#define DEBUG_MINICL_KERNELS 1
+#define DEBUG_MINICL_KERNELS 1
 
 static char* spPlatformID = "MiniCL, SCEA";
 
@@ -316,7 +317,7 @@ static void* localBufMalloc(int size)
 	if((sLocalBufUsed + size16) > LOCAL_BUF_SIZE)
 	{ // reset
 		spLocalBufCurr = sLocalMemBuf;
-		while((int)spLocalBufCurr & 0x0F) spLocalBufCurr++; // align to 16 bytes
+		while((unsigned long)spLocalBufCurr & 0x0F) spLocalBufCurr++; // align to 16 bytes
 		sLocalBufUsed = 0;
 	}
 	void* ret = spLocalBufCurr;
@@ -339,8 +340,8 @@ CL_API_ENTRY cl_int CL_API_CALL clSetKernelArg(cl_kernel    clKernel ,
 		printf("error: clSetKernelArg arg_index (%d) exceeds %d\n",arg_index,MINI_CL_MAX_ARG);
 	} else
 	{
-//		if (arg_size>=MINICL_MAX_ARGLENGTH)
-		if (arg_size != MINICL_MAX_ARGLENGTH)
+		if (arg_size>MINICL_MAX_ARGLENGTH)
+		//if (arg_size != MINICL_MAX_ARGLENGTH)
 		{
 			printf("error: clSetKernelArg argdata too large: %d (maximum is %d)\n",arg_size,MINICL_MAX_ARGLENGTH);
 		} 
@@ -559,12 +560,12 @@ clGetKernelWorkGroupInfo(cl_kernel                   kernel ,
                          size_t *                   /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0
 {
 	if((wgi == CL_KERNEL_WORK_GROUP_SIZE)
-	 &&(sz == sizeof(int))
+	 &&(sz == sizeof(size_t))
 	 &&(ptr != NULL))
 	{
 		MiniCLKernel* miniCLKernel = (MiniCLKernel*)kernel;
 		MiniCLTaskScheduler* scheduler = miniCLKernel->m_scheduler;
-		*((int*)ptr) = scheduler->getMaxNumOutstandingTasks();
+		*((size_t*)ptr) = scheduler->getMaxNumOutstandingTasks();
 		return CL_SUCCESS;
 	}
 	else
diff --git a/src/BulletMultiThreaded/PlatformDefinitions.h b/src/BulletMultiThreaded/PlatformDefinitions.h
index 16362f4bc..68c16e15e 100644
--- a/src/BulletMultiThreaded/PlatformDefinitions.h
+++ b/src/BulletMultiThreaded/PlatformDefinitions.h
@@ -2,6 +2,18 @@
 #define TYPE_DEFINITIONS_H
 
 ///This file provides some platform/compiler checks for common definitions
+#include "LinearMath/btScalar.h"
+#include "LinearMath/btMinMax.h"
+
+#include <vectormath_aos.h>
+typedef Vectormath::Aos::Vector3    vmVector3;
+typedef Vectormath::Aos::Quat       vmQuat;
+typedef Vectormath::Aos::Matrix3    vmMatrix3;
+typedef Vectormath::Aos::Transform3 vmTransform3;
+typedef Vectormath::Aos::Point3		vmPoint3;
+
+
+
 
 #ifdef _WIN32
 
@@ -19,9 +31,11 @@ typedef union
 
 		typedef unsigned char     uint8_t;
 #ifndef __PHYSICS_COMMON_H__
+#ifndef __PFX_COMMON_H__
 #ifndef __BT_SKIP_UINT64_H
 		typedef unsigned long int uint64_t;
 #endif //__BT_SKIP_UINT64_H
+#endif //__PFX_COMMON_H__
 		typedef unsigned int      uint32_t;
 #endif //__PHYSICS_COMMON_H__
 		typedef unsigned short    uint16_t;
@@ -54,26 +68,22 @@ typedef union
 #include <stdio.h>		
 #define spu_printf printf	
 #define DWORD unsigned int
-		
 			typedef union
 			{
 			  unsigned long long ull;
 			  unsigned int ui[2];
 			  void *p;
 			} addr64;
-		
-		
-#else
-
-#include <stdio.h>		
-#define spu_printf printf	
-
 #endif // USE_LIBSPE2
-	
+
 #endif	//__CELLOS_LV2__
 	
 #endif
 
+#ifdef __SPU__
+#include <stdio.h>		
+#define printf spu_printf
+#endif
 
 /* Included here because we need uint*_t typedefs */
 #include "PpuAddressSpace.h"
diff --git a/src/BulletMultiThreaded/PpuAddressSpace.h b/src/BulletMultiThreaded/PpuAddressSpace.h
index f36fdfb3c..833b22bf9 100644
--- a/src/BulletMultiThreaded/PpuAddressSpace.h
+++ b/src/BulletMultiThreaded/PpuAddressSpace.h
@@ -8,8 +8,9 @@
 #pragma warning (disable: 4312)
 #endif //_WIN32
 
-#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64__) || defined(USE_ADDR64)
-typedef uint64_t ppu_address_t;
+
+#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64__)
+typedef unsigned __int64 ppu_address_t;
 #else
 
 typedef uint32_t ppu_address_t;
diff --git a/src/BulletMultiThreaded/SequentialThreadSupport.cpp b/src/BulletMultiThreaded/SequentialThreadSupport.cpp
index 4e9c822bb..8cc72418f 100644
--- a/src/BulletMultiThreaded/SequentialThreadSupport.cpp
+++ b/src/BulletMultiThreaded/SequentialThreadSupport.cpp
@@ -91,3 +91,79 @@ void SequentialThreadSupport::setNumTasks(int numTasks)
 {
 	printf("SequentialThreadSupport::setNumTasks(%d) is not implemented and has no effect\n",numTasks);
 }
+
+
+
+
+class btDummyBarrier : public btBarrier
+{
+private:
+		
+public:
+	btDummyBarrier()
+	{
+	}
+	
+	virtual ~btDummyBarrier()
+	{
+	}
+	
+	void sync()
+	{
+	}
+	
+	virtual void setMaxCount(int n) {}
+	virtual int  getMaxCount() {return 1;}
+};
+
+class btDummyCriticalSection : public btCriticalSection
+{
+	
+public:
+	btDummyCriticalSection()
+	{
+	}
+	
+	virtual ~btDummyCriticalSection()
+	{
+	}
+	
+	unsigned int getSharedParam(int i)
+	{
+		btAssert(i>=0&&i<31);
+		return mCommonBuff[i+1];
+	}
+	
+	void setSharedParam(int i,unsigned int p)
+	{
+		btAssert(i>=0&&i<31);
+		mCommonBuff[i+1] = p;
+	}
+	
+	void lock()
+	{
+		mCommonBuff[0] = 1;
+	}
+	
+	void unlock()
+	{
+		mCommonBuff[0] = 0;
+	}
+};
+
+
+
+
+btBarrier*	SequentialThreadSupport::createBarrier()
+{
+	return new btDummyBarrier();
+}
+
+btCriticalSection* SequentialThreadSupport::createCriticalSection()
+{
+	return new btDummyCriticalSection();
+	
+}
+
+
+
diff --git a/src/BulletMultiThreaded/SequentialThreadSupport.h b/src/BulletMultiThreaded/SequentialThreadSupport.h
index 514e54ee9..2b65a4842 100644
--- a/src/BulletMultiThreaded/SequentialThreadSupport.h
+++ b/src/BulletMultiThreaded/SequentialThreadSupport.h
@@ -85,10 +85,10 @@ public:
 	{
 		return 1;
 	}
-	virtual btBarrier*	createBarrier() { return 0;}
-
-	virtual btCriticalSection* createCriticalSection()  { return 0;};
+	virtual btBarrier*	createBarrier();
 
+	virtual btCriticalSection* createCriticalSection();
+	
 
 };
 
diff --git a/src/BulletMultiThreaded/SpuFakeDma.cpp b/src/BulletMultiThreaded/SpuFakeDma.cpp
index 62cef3961..61e1a49d6 100644
--- a/src/BulletMultiThreaded/SpuFakeDma.cpp
+++ b/src/BulletMultiThreaded/SpuFakeDma.cpp
@@ -174,6 +174,9 @@ int	cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid,
 {
 	char* mainMem = (char*)ea;
 	char* localStore = (char*)ls;
+
+//	printf("mainMem=%x, localStore=%x",mainMem,localStore);
+
 #ifdef USE_MEMCPY
 	memcpy(localStore,mainMem,size);
 #else
@@ -182,6 +185,7 @@ int	cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid,
 		localStore[i] = mainMem[i];
 	}	
 #endif //#ifdef USE_MEMCPY
+//	printf(" finished\n");
 	return 0;
 }
 
diff --git a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h
index 9bc2ebf51..e07c731ef 100644
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/Box.h
@@ -25,15 +25,11 @@ subject to the following restrictions:
 #include <math.h>
 
 ///only use a system-wide vectormath_aos.h on CELLOS_LV2 or if USE_SYSTEM_VECTORMATH
-#if defined(__CELLOS_LV2__) || defined (USE_SYSTEM_VECTORMATH)
-#include <vectormath_aos.h>
-#else
-#include "BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h"
-#endif
+#include "vectormath_aos.h"
+#include "../PlatformDefinitions.h"
 
 
 
-using namespace Vectormath::Aos;
 
 enum FeatureType { F, E, V };
 
@@ -44,21 +40,21 @@ enum FeatureType { F, E, V };
 class Box
 {
 public:
-	Vector3 half;
+	vmVector3 mHalf;
 
 	inline Box()
 	{}
-	inline Box(PE_REF(Vector3) half_);
+	inline Box(PE_REF(vmVector3) half_);
 	inline Box(float hx, float hy, float hz);
 
-	inline void Set(PE_REF(Vector3) half_);
+	inline void Set(PE_REF(vmVector3) half_);
 	inline void Set(float hx, float hy, float hz);
 
-	inline Vector3 GetAABB(const Matrix3& rotation) const;
+	inline vmVector3 GetAABB(const vmMatrix3& rotation) const;
 };
 
 inline
-Box::Box(PE_REF(Vector3) half_)
+Box::Box(PE_REF(vmVector3) half_)
 {
 	Set(half_);
 }
@@ -71,23 +67,23 @@ Box::Box(float hx, float hy, float hz)
 
 inline
 void
-Box::Set(PE_REF(Vector3) half_)
+Box::Set(PE_REF(vmVector3) half_)
 {
-	half = half_;
+	mHalf = half_;
 }
 
 inline
 void
 Box::Set(float hx, float hy, float hz)
 {
-	half = Vector3(hx, hy, hz);
+	mHalf = vmVector3(hx, hy, hz);
 }
 
 inline
-Vector3
-Box::GetAABB(const Matrix3& rotation) const
+vmVector3
+Box::GetAABB(const vmMatrix3& rotation) const
 {
-	return absPerElem(rotation) * half;
+	return absPerElem(rotation) * mHalf;
 }
 
 //-------------------------------------------------------------------------------------------------
@@ -100,7 +96,7 @@ class BoxPoint
 public:
 	BoxPoint() : localPoint(0.0f) {}
 
-	Point3      localPoint;
+	vmPoint3      localPoint;
 	FeatureType featureType;
 	int         featureIdx;
 
diff --git a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
index 8e540d929..a71095380 100644
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
@@ -22,7 +22,7 @@ subject to the following restrictions:
 #include <stdio.h>
 #define spu_printf printf
 #endif
-#endif DEBUG_SPU_COLLISION_DETECTION
+#endif //DEBUG_SPU_COLLISION_DETECTION
 
 SpuContactResult::SpuContactResult()
 {
diff --git a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
index 80a98ffc7..dfab98028 100644
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
@@ -52,7 +52,7 @@ subject to the following restrictions:
 #ifdef __SPU__
 ///Software caching from the IBM Cell SDK, it reduces 25% SPU time for our test cases
 #ifndef USE_LIBSPE2
-#define USE_SOFTWARE_CACHE 1
+//#define USE_SOFTWARE_CACHE 1
 #endif
 #endif //__SPU__
 
@@ -1216,7 +1216,7 @@ void	processCollisionTask(void* userPtr, void* lsMemPtr)
 #endif											
 										)
 										{
-//#define USE_PE_BOX_BOX 1
+#define USE_PE_BOX_BOX 1
 #ifdef USE_PE_BOX_BOX
 											{
 
@@ -1225,38 +1225,64 @@ void	processCollisionTask(void* userPtr, void* lsMemPtr)
 												btScalar margin1 = lsMem.getlocalCollisionAlgorithm()->getCollisionMargin1();
 												btVector3 shapeDim0 = lsMem.getlocalCollisionAlgorithm()->getShapeDimensions0()+btVector3(margin0,margin0,margin0);
 												btVector3 shapeDim1 = lsMem.getlocalCollisionAlgorithm()->getShapeDimensions1()+btVector3(margin1,margin1,margin1);
+/*
+												//Box boxA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ());
+												vmVector3 vmPos0 = getVmVector3(collisionPairInput.m_worldTransform0.getOrigin());
+												vmVector3 vmPos1 = getVmVector3(collisionPairInput.m_worldTransform1.getOrigin());
+												vmMatrix3 vmMatrix0 = getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis());
+												vmMatrix3 vmMatrix1 = getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis());
 
-												Box boxA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ());
-												Vector3 vmPos0 = getVmVector3(collisionPairInput.m_worldTransform0.getOrigin());
-												Vector3 vmPos1 = getVmVector3(collisionPairInput.m_worldTransform1.getOrigin());
-												Matrix3 vmMatrix0 = getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis());
-												Matrix3 vmMatrix1 = getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis());
-
-												Transform3 transformA(vmMatrix0,vmPos0);
+												vmTransform3 transformA(vmMatrix0,vmPos0);
 												Box boxB(shapeDim1.getX(),shapeDim1.getY(),shapeDim1.getZ());
-												Transform3 transformB(vmMatrix1,vmPos1);
+												vmTransform3 transformB(vmMatrix1,vmPos1);
 												BoxPoint resultClosestBoxPointA;
 												BoxPoint resultClosestBoxPointB;
-												Vector3 resultNormal;
+												vmVector3 resultNormal;
+												*/
+
 #ifdef USE_SEPDISTANCE_UTIL
 												float distanceThreshold = FLT_MAX
 #else
-												float distanceThreshold = 0.f;
+												//float distanceThreshold = 0.f;
 #endif
 
 
-												distance = boxBoxDistance(resultNormal,resultClosestBoxPointA,resultClosestBoxPointB,  boxA, transformA, boxB,transformB,distanceThreshold);
+												vmVector3 n;
+												Box boxA;
+												vmVector3 hA(shapeDim0.getX(),shapeDim0.getY(),shapeDim0.getZ());
+												vmVector3 hB(shapeDim1.getX(),shapeDim1.getY(),shapeDim1.getZ());
+												boxA.mHalf= hA;
+												vmTransform3 trA;
+												trA.setTranslation(getVmVector3(collisionPairInput.m_worldTransform0.getOrigin()));
+												trA.setUpper3x3(getVmMatrix3(collisionPairInput.m_worldTransform0.getBasis()));
+												Box boxB;
+												boxB.mHalf = hB;
+												vmTransform3 trB;
+												trB.setTranslation(getVmVector3(collisionPairInput.m_worldTransform1.getOrigin()));
+												trB.setUpper3x3(getVmMatrix3(collisionPairInput.m_worldTransform1.getBasis()));
 												
-												normalInB = -getBtVector3(resultNormal);
+												float distanceThreshold = spuManifold->getContactBreakingThreshold();//0.001f;
 
-												if(distance < spuManifold->getContactBreakingThreshold())
+
+												BoxPoint ptA,ptB;
+												float dist = boxBoxDistance(n, ptA, ptB,
+														   boxA, trA, boxB,	   trB,
+															distanceThreshold );
+
+
+//												float distance = boxBoxDistance(resultNormal,resultClosestBoxPointA,resultClosestBoxPointB,  boxA, transformA, boxB,transformB,distanceThreshold);
+												
+												normalInB = -getBtVector3(n);//resultNormal);
+
+												//if(dist < distanceThreshold)//spuManifold->getContactBreakingThreshold())
+												if(dist < spuManifold->getContactBreakingThreshold())
 												{
-													btVector3 pointOnB = collisionPairInput.m_worldTransform1(getBtVector3(resultClosestBoxPointB.localPoint));
+													btVector3 pointOnB = collisionPairInput.m_worldTransform1(getBtVector3(ptB.localPoint));
 
 													spuContacts.addContactPoint(
 														normalInB,
 														pointOnB,
-														distance);
+														dist);
 												}
 											} 
 #else									
diff --git a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
index 30642a392..5e1202c01 100644
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.cpp
@@ -15,7 +15,11 @@ subject to the following restrictions:
 */
 
 
-#include "Box.h"
+//#include "PfxContactBoxBox.h"
+
+#include <math.h>
+#include "../PlatformDefinitions.h"
+#include "boxBoxDistance.h"
 
 static inline float sqr( float a )
 {
@@ -114,18 +118,18 @@ VertexBFaceATest(
 	bool & inVoronoi,
 	float & t0,
 	float & t1,
-	const Vector3 & hA,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesB )
+	const vmVector3 & hA,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesB )
 {
 	// compute a corner of box B in A's coordinate system
 
-	Vector3 corner =
-		Vector3( faceOffsetAB + matrixAB.getCol0() * scalesB.getX() + matrixAB.getCol1() * scalesB.getY() );
+	vmVector3 corner =
+		vmVector3( faceOffsetAB + matrixAB.getCol0() * scalesB.getX() + matrixAB.getCol1() * scalesB.getY() );
 
 	// compute the parameters of the point on A, closest to this corner
 
@@ -144,8 +148,8 @@ VertexBFaceATest(
 	// do the Voronoi test: already know the point on B is in the Voronoi region of the
 	// point on A, check the reverse.
 
-	Vector3 facePointB =
-		Vector3( mulPerElem( faceOffsetBA + matrixBA.getCol0() * t0 + matrixBA.getCol1() * t1 - scalesB, signsB ) );
+	vmVector3 facePointB =
+		vmVector3( mulPerElem( faceOffsetBA + matrixBA.getCol0() * t0 + matrixBA.getCol1() * t1 - scalesB, signsB ) );
 
 	inVoronoi = ( ( facePointB[0] >= voronoiTol * facePointB[2] ) &&
 				  ( facePointB[1] >= voronoiTol * facePointB[0] ) &&
@@ -169,17 +173,17 @@ void
 VertexBFaceATests(
 	bool & done,
 	float & minDistSqr,
-	Point3 & localPointA,
-	Point3 & localPointB,
+	vmPoint3 & localPointA,
+	vmPoint3 & localPointB,
 	FeatureType & featureA,
 	FeatureType & featureB,
-	const Vector3 & hA,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesB,
+	const vmVector3 & hA,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesB,
 	bool first )
 {
 		
@@ -247,16 +251,16 @@ VertexAFaceBTest(
 	bool & inVoronoi,
 	float & t0,
 	float & t1,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) scalesA )
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) scalesA )
 {
-	Vector3 corner =
-		Vector3( faceOffsetBA + matrixBA.getCol0() * scalesA.getX() + matrixBA.getCol1() * scalesA.getY() );
+	vmVector3 corner =
+		vmVector3( faceOffsetBA + matrixBA.getCol0() * scalesA.getX() + matrixBA.getCol1() * scalesA.getY() );
 
 	t0 = corner[0];
 	t1 = corner[1];
@@ -270,8 +274,8 @@ VertexAFaceBTest(
 	else if ( t1 < -hB[1] )
 		t1 = -hB[1];
 
-	Vector3 facePointA =
-		Vector3( mulPerElem( faceOffsetAB + matrixAB.getCol0() * t0 + matrixAB.getCol1() * t1 - scalesA, signsA ) );
+	vmVector3 facePointA =
+		vmVector3( mulPerElem( faceOffsetAB + matrixAB.getCol0() * t0 + matrixAB.getCol1() * t1 - scalesA, signsA ) );
 
 	inVoronoi = ( ( facePointA[0] >= voronoiTol * facePointA[2] ) &&
 				  ( facePointA[1] >= voronoiTol * facePointA[0] ) &&
@@ -295,17 +299,17 @@ void
 VertexAFaceBTests(
 	bool & done,
 	float & minDistSqr,
-	Point3 & localPointA,
-	Point3 & localPointB,
+	vmPoint3 & localPointA,
+	vmPoint3 & localPointB,
 	FeatureType & featureA,
 	FeatureType & featureB,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) scalesA,
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) scalesA,
 	bool first )
 {
 	float t0, t1;
@@ -363,7 +367,7 @@ VertexAFaceBTests(
 }
 
 //-------------------------------------------------------------------------------------------------
-// EdgeEdgeTest:
+// CustomEdgeEdgeTest:
 //
 // tests whether a pair of edges are the closest features
 //
@@ -374,10 +378,10 @@ VertexAFaceBTests(
 // the dimension of the face normal is 2
 //-------------------------------------------------------------------------------------------------
 
-#define EdgeEdgeTest( ac, ac_letter, ad, ad_letter, bc, bc_letter, bd, bd_letter )              \
+#define CustomEdgeEdgeTest( ac, ac_letter, ad, ad_letter, bc, bc_letter, bd, bd_letter )              \
 {                                                                                               \
-   Vector3 edgeOffsetAB;                                                                          \
-   Vector3 edgeOffsetBA;                                                                          \
+   vmVector3 edgeOffsetAB;                                                                          \
+   vmVector3 edgeOffsetBA;                                                                          \
                                                                                                 \
    edgeOffsetAB = faceOffsetAB + matrixAB.getCol##bc() * scalesB.get##bc_letter();            \
    edgeOffsetAB.set##ac_letter( edgeOffsetAB.get##ac_letter() - scalesA.get##ac_letter() );  \
@@ -421,8 +425,8 @@ VertexAFaceBTests(
       else if ( tA > hA[ad] ) tA = hA[ad];                                                      \
    }                                                                                            \
                                                                                                 \
-   Vector3 edgeOffAB = Vector3( mulPerElem( edgeOffsetAB + matrixAB.getCol##bd() * tB, signsA ) );\
-   Vector3 edgeOffBA = Vector3( mulPerElem( edgeOffsetBA + matrixBA.getCol##ad() * tA, signsB ) );\
+   vmVector3 edgeOffAB = vmVector3( mulPerElem( edgeOffsetAB + matrixAB.getCol##bd() * tB, signsA ) );\
+   vmVector3 edgeOffBA = vmVector3( mulPerElem( edgeOffsetBA + matrixBA.getCol##ad() * tA, signsB ) );\
                                                                                                 \
    inVoronoi = ( edgeOffAB[ac] >= voronoiTol * edgeOffAB[2] ) &&                                \
                ( edgeOffAB[2] >= voronoiTol * edgeOffAB[ac] ) &&                                \
@@ -436,79 +440,79 @@ VertexAFaceBTests(
 }
 
 float
-EdgeEdgeTest_0101(
+CustomEdgeEdgeTest_0101(
 	bool & inVoronoi,
 	float & tA,
 	float & tB,
-	const Vector3 & hA,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesA,
-	PE_REF(Vector3) scalesB )
+	const vmVector3 & hA,
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesA,
+	PE_REF(vmVector3) scalesB )
 {
-	EdgeEdgeTest( 0, X, 1, Y, 0, X, 1, Y );
+	CustomEdgeEdgeTest( 0, X, 1, Y, 0, X, 1, Y );
 }
 
 float
-EdgeEdgeTest_0110(
+CustomEdgeEdgeTest_0110(
 	bool & inVoronoi,
 	float & tA,
 	float & tB,
-	const Vector3 & hA,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesA,
-	PE_REF(Vector3) scalesB )
+	const vmVector3 & hA,
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesA,
+	PE_REF(vmVector3) scalesB )
 {
-	EdgeEdgeTest( 0, X, 1, Y, 1, Y, 0, X );
+	CustomEdgeEdgeTest( 0, X, 1, Y, 1, Y, 0, X );
 }
 
 float
-EdgeEdgeTest_1001(
+CustomEdgeEdgeTest_1001(
 	bool & inVoronoi,
 	float & tA,
 	float & tB,
-	const Vector3 & hA,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesA,
-	PE_REF(Vector3) scalesB )
+	const vmVector3 & hA,
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesA,
+	PE_REF(vmVector3) scalesB )
 {
-	EdgeEdgeTest( 1, Y, 0, X, 0, X, 1, Y );
+	CustomEdgeEdgeTest( 1, Y, 0, X, 0, X, 1, Y );
 }
 
 float
-EdgeEdgeTest_1010(
+CustomEdgeEdgeTest_1010(
 	bool & inVoronoi,
 	float & tA,
 	float & tB,
-	const Vector3 & hA,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesA,
-	PE_REF(Vector3) scalesB )
+	const vmVector3 & hA,
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesA,
+	PE_REF(vmVector3) scalesB )
 {
-	EdgeEdgeTest( 1, Y, 0, X, 1, Y, 0, X );
+	CustomEdgeEdgeTest( 1, Y, 0, X, 1, Y, 0, X );
 }
 
 #define EdgeEdge_SetNewMin( ac_letter, ad_letter, bc_letter, bd_letter )   \
@@ -528,22 +532,22 @@ void
 EdgeEdgeTests(
 	bool & done,
 	float & minDistSqr,
-	Point3 & localPointA,
-	Point3 & localPointB,
+	vmPoint3 & localPointA,
+	vmPoint3 & localPointB,
 	int & otherFaceDimA,
 	int & otherFaceDimB,
 	FeatureType & featureA,
 	FeatureType & featureB,
-	const Vector3 & hA,
-	const Vector3 & hB,
-	PE_REF(Vector3) faceOffsetAB,
-	PE_REF(Vector3) faceOffsetBA,
-	const Matrix3 & matrixAB,
-	const Matrix3 & matrixBA,
-	PE_REF(Vector3) signsA,
-	PE_REF(Vector3) signsB,
-	PE_REF(Vector3) scalesA,
-	PE_REF(Vector3) scalesB,
+	const vmVector3 & hA,
+	const vmVector3 & hB,
+	PE_REF(vmVector3) faceOffsetAB,
+	PE_REF(vmVector3) faceOffsetBA,
+	const vmMatrix3 & matrixAB,
+	const vmMatrix3 & matrixBA,
+	PE_REF(vmVector3) signsA,
+	PE_REF(vmVector3) signsB,
+	PE_REF(vmVector3) scalesA,
+	PE_REF(vmVector3) scalesB,
 	bool first )
 {
 
@@ -555,7 +559,7 @@ EdgeEdgeTests(
 	testOtherFaceDimA = 0;
 	testOtherFaceDimB = 0;
 
-	distSqr = EdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( first ) {
@@ -572,7 +576,7 @@ EdgeEdgeTests(
 	signsA.setX( -signsA.getX() );
 	scalesA.setX( -scalesA.getX() );
 
-	distSqr = EdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -585,7 +589,7 @@ EdgeEdgeTests(
 	signsB.setX( -signsB.getX() );
 	scalesB.setX( -scalesB.getX() );
 
-	distSqr = EdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -598,7 +602,7 @@ EdgeEdgeTests(
 	signsA.setX( -signsA.getX() );
 	scalesA.setX( -scalesA.getX() );
 
-	distSqr = EdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0101( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -613,7 +617,7 @@ EdgeEdgeTests(
 	signsB.setX( -signsB.getX() );
 	scalesB.setX( -scalesB.getX() );
 
-	distSqr = EdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -626,7 +630,7 @@ EdgeEdgeTests(
 	signsA.setY( -signsA.getY() );
 	scalesA.setY( -scalesA.getY() );
 
-	distSqr = EdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -639,7 +643,7 @@ EdgeEdgeTests(
 	signsB.setX( -signsB.getX() );
 	scalesB.setX( -scalesB.getX() );
 
-	distSqr = EdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -652,7 +656,7 @@ EdgeEdgeTests(
 	signsA.setY( -signsA.getY() );
 	scalesA.setY( -scalesA.getY() );
 
-	distSqr = EdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1001( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -667,7 +671,7 @@ EdgeEdgeTests(
 	signsB.setX( -signsB.getX() );
 	scalesB.setX( -scalesB.getX() );
 
-	distSqr = EdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -680,7 +684,7 @@ EdgeEdgeTests(
 	signsA.setX( -signsA.getX() );
 	scalesA.setX( -scalesA.getX() );
 
-	distSqr = EdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -693,7 +697,7 @@ EdgeEdgeTests(
 	signsB.setY( -signsB.getY() );
 	scalesB.setY( -scalesB.getY() );
 
-	distSqr = EdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -706,7 +710,7 @@ EdgeEdgeTests(
 	signsA.setX( -signsA.getX() );
 	scalesA.setX( -scalesA.getX() );
 
-	distSqr = EdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_0110( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -721,7 +725,7 @@ EdgeEdgeTests(
 	signsB.setY( -signsB.getY() );
 	scalesB.setY( -scalesB.getY() );
 
-	distSqr = EdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -734,7 +738,7 @@ EdgeEdgeTests(
 	signsA.setY( -signsA.getY() );
 	scalesA.setY( -scalesA.getY() );
 
-	distSqr = EdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -747,7 +751,7 @@ EdgeEdgeTests(
 	signsB.setY( -signsB.getY() );
 	scalesB.setY( -scalesB.getY() );
 
-	distSqr = EdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -760,7 +764,7 @@ EdgeEdgeTests(
 	signsA.setY( -signsA.getY() );
 	scalesA.setY( -scalesA.getY() );
 
-	distSqr = EdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
+	distSqr = CustomEdgeEdgeTest_1010( done, tA, tB, hA, hB, faceOffsetAB, faceOffsetBA,
 								 matrixAB, matrixBA, signsA, signsB, scalesA, scalesB );
 
 	if ( distSqr < minDistSqr ) {
@@ -768,27 +772,25 @@ EdgeEdgeTests(
 	}
 }
 
+
 float
-boxBoxDistance(
-	Vector3& normal,
-	BoxPoint& boxPointA,
-	BoxPoint& boxPointB,
-	PE_REF(Box) boxA, const Transform3& transformA,
-	PE_REF(Box) boxB, const Transform3& transformB,
-	float distanceThreshold )
+boxBoxDistance(vmVector3& normal, BoxPoint& boxPointA, BoxPoint& boxPointB,
+			   PE_REF(Box) boxA, const vmTransform3 & transformA, PE_REF(Box) boxB,
+			   const vmTransform3 & transformB,
+			   float distanceThreshold)
 {
-	Matrix3 identity;
-	identity = Matrix3::identity();
-	Vector3 ident[3];
+	vmMatrix3 identity;
+	identity = vmMatrix3::identity();
+	vmVector3 ident[3];
 	ident[0] = identity.getCol0();
 	ident[1] = identity.getCol1();
 	ident[2] = identity.getCol2();
 
 	// get relative transformations
 
-	Transform3 transformAB, transformBA;
-	Matrix3 matrixAB, matrixBA;
-	Vector3 offsetAB, offsetBA;
+	vmTransform3 transformAB, transformBA;
+	vmMatrix3 matrixAB, matrixBA;
+	vmVector3 offsetAB, offsetBA;
 
 	transformAB = orthoInverse(transformA) * transformB;
 	transformBA = orthoInverse(transformAB);
@@ -798,25 +800,25 @@ boxBoxDistance(
 	matrixBA = transformBA.getUpper3x3();
 	offsetBA = transformBA.getTranslation();
 
-	Matrix3 absMatrixAB = absPerElem(matrixAB);
-	Matrix3 absMatrixBA = absPerElem(matrixBA);
+	vmMatrix3 absMatrixAB = absPerElem(matrixAB);
+	vmMatrix3 absMatrixBA = absPerElem(matrixBA);
 
 	// find separating axis with largest gap between projections
 
 	BoxSepAxisType axisType;
-	Vector3 axisA(0.0f), axisB(0.0f);
+	vmVector3 axisA(0.0f), axisB(0.0f);
 	float gap, maxGap;
 	int faceDimA = 0, faceDimB = 0, edgeDimA = 0, edgeDimB = 0;
 
 	// face axes
 
-	Vector3  gapsA   = absPerElem(offsetAB) - boxA.half - absMatrixAB * boxB.half;
+	vmVector3  gapsA   = absPerElem(offsetAB) - boxA.mHalf - absMatrixAB * boxB.mHalf;
 
 	AaxisTest(0,X,true);
 	AaxisTest(1,Y,false);
 	AaxisTest(2,Z,false);
 
-	Vector3  gapsB   = absPerElem(offsetBA) - boxB.half - absMatrixBA * boxA.half;
+	vmVector3  gapsB   = absPerElem(offsetBA) - boxB.mHalf - absMatrixBA * boxA.mHalf;
 
 	BaxisTest(0,X);
 	BaxisTest(1,Y);
@@ -825,10 +827,10 @@ boxBoxDistance(
 	// cross product axes
 
 	// �O�ς��O�̂Ƃ��̑΍�
-	absMatrixAB += Matrix3(1.0e-5f);
-	absMatrixBA += Matrix3(1.0e-5f);
+	absMatrixAB += vmMatrix3(1.0e-5f);
+	absMatrixBA += vmMatrix3(1.0e-5f);
 
-	Matrix3 lsqrs, projOffset, projAhalf, projBhalf;
+	vmMatrix3 lsqrs, projOffset, projAhalf, projBhalf;
 
 	lsqrs.setCol0( mulPerElem( matrixBA.getCol2(), matrixBA.getCol2() ) +
 				   mulPerElem( matrixBA.getCol1(), matrixBA.getCol1() ) );
@@ -841,15 +843,15 @@ boxBoxDistance(
 	projOffset.setCol1(matrixBA.getCol2() * offsetAB.getX() - matrixBA.getCol0() * offsetAB.getZ());
 	projOffset.setCol2(matrixBA.getCol0() * offsetAB.getY() - matrixBA.getCol1() * offsetAB.getX());
 
-	projAhalf.setCol0(absMatrixBA.getCol1() * boxA.half.getZ() + absMatrixBA.getCol2() * boxA.half.getY());
-	projAhalf.setCol1(absMatrixBA.getCol2() * boxA.half.getX() + absMatrixBA.getCol0() * boxA.half.getZ());
-	projAhalf.setCol2(absMatrixBA.getCol0() * boxA.half.getY() + absMatrixBA.getCol1() * boxA.half.getX());
+	projAhalf.setCol0(absMatrixBA.getCol1() * boxA.mHalf.getZ() + absMatrixBA.getCol2() * boxA.mHalf.getY());
+	projAhalf.setCol1(absMatrixBA.getCol2() * boxA.mHalf.getX() + absMatrixBA.getCol0() * boxA.mHalf.getZ());
+	projAhalf.setCol2(absMatrixBA.getCol0() * boxA.mHalf.getY() + absMatrixBA.getCol1() * boxA.mHalf.getX());
 
-	projBhalf.setCol0(absMatrixAB.getCol1() * boxB.half.getZ() + absMatrixAB.getCol2() * boxB.half.getY());
-	projBhalf.setCol1(absMatrixAB.getCol2() * boxB.half.getX() + absMatrixAB.getCol0() * boxB.half.getZ());
-	projBhalf.setCol2(absMatrixAB.getCol0() * boxB.half.getY() + absMatrixAB.getCol1() * boxB.half.getX());
+	projBhalf.setCol0(absMatrixAB.getCol1() * boxB.mHalf.getZ() + absMatrixAB.getCol2() * boxB.mHalf.getY());
+	projBhalf.setCol1(absMatrixAB.getCol2() * boxB.mHalf.getX() + absMatrixAB.getCol0() * boxB.mHalf.getZ());
+	projBhalf.setCol2(absMatrixAB.getCol0() * boxB.mHalf.getY() + absMatrixAB.getCol1() * boxB.mHalf.getX());
 
-	Matrix3 gapsAxB = absPerElem(projOffset) - projAhalf - transpose(projBhalf);
+	vmMatrix3 gapsAxB = absPerElem(projOffset) - projAhalf - transpose(projBhalf);
 
 	CrossAxisTest(0,0,X);
 	CrossAxisTest(0,1,Y);
@@ -872,7 +874,7 @@ boxBoxDistance(
 			axisA = -axisA;
 		axisB = matrixBA * -axisA;
 
-		Vector3 absAxisB = Vector3(absPerElem(axisB));
+		vmVector3 absAxisB = vmVector3(absPerElem(axisB));
 
 		if ( ( absAxisB[0] > absAxisB[1] ) && ( absAxisB[0] > absAxisB[2] ) )
 			faceDimB = 0;
@@ -885,7 +887,7 @@ boxBoxDistance(
 			axisB = -axisB;
 		axisA = matrixAB * -axisB;
 
-		Vector3 absAxisA = Vector3(absPerElem(axisA));
+		vmVector3 absAxisA = vmVector3(absPerElem(axisA));
 
 		if ( ( absAxisA[0] > absAxisA[1] ) && ( absAxisA[0] > absAxisA[2] ) )
 			faceDimA = 0;
@@ -900,8 +902,8 @@ boxBoxDistance(
 			axisA = -axisA;
 		axisB = matrixBA * -axisA;
 
-		Vector3 absAxisA = Vector3(absPerElem(axisA));
-		Vector3 absAxisB = Vector3(absPerElem(axisB));
+		vmVector3 absAxisA = vmVector3(absPerElem(axisA));
+		vmVector3 absAxisB = vmVector3(absPerElem(axisB));
 
 		dimA[1] = edgeDimA;
 		dimB[1] = edgeDimB;
@@ -966,7 +968,7 @@ boxBoxDistance(
 		dimB[1] = (faceDimB+2)%3;
 	}
 
-	Matrix3 aperm_col, bperm_col;
+	vmMatrix3 aperm_col, bperm_col;
 
 	aperm_col.setCol0(ident[dimA[0]]);
 	aperm_col.setCol1(ident[dimA[1]]);
@@ -976,32 +978,32 @@ boxBoxDistance(
 	bperm_col.setCol1(ident[dimB[1]]);
 	bperm_col.setCol2(ident[dimB[2]]);
 
-	Matrix3 aperm_row, bperm_row;
+	vmMatrix3 aperm_row, bperm_row;
 
 	aperm_row = transpose(aperm_col);
 	bperm_row = transpose(bperm_col);
 
 	// permute all box parameters to be in the face coordinate systems
 
-	Matrix3 matrixAB_perm = aperm_row * matrixAB * bperm_col;
-	Matrix3 matrixBA_perm = transpose(matrixAB_perm);
+	vmMatrix3 matrixAB_perm = aperm_row * matrixAB * bperm_col;
+	vmMatrix3 matrixBA_perm = transpose(matrixAB_perm);
 
-	Vector3 offsetAB_perm, offsetBA_perm;
+	vmVector3 offsetAB_perm, offsetBA_perm;
 
 	offsetAB_perm = aperm_row * offsetAB;
 	offsetBA_perm = bperm_row * offsetBA;
 
-	Vector3 halfA_perm, halfB_perm;
+	vmVector3 halfA_perm, halfB_perm;
 
-	halfA_perm = aperm_row * boxA.half;
-	halfB_perm = bperm_row * boxB.half;
+	halfA_perm = aperm_row * boxA.mHalf;
+	halfB_perm = bperm_row * boxB.mHalf;
 
 	// compute the vector between the centers of each face, in each face's coordinate frame
 
-	Vector3 signsA_perm, signsB_perm, scalesA_perm, scalesB_perm, faceOffsetAB_perm, faceOffsetBA_perm;
+	vmVector3 signsA_perm, signsB_perm, scalesA_perm, scalesB_perm, faceOffsetAB_perm, faceOffsetBA_perm;
 
-	signsA_perm = copySignPerElem(Vector3(1.0f),aperm_row * axisA);
-	signsB_perm = copySignPerElem(Vector3(1.0f),bperm_row * axisB);
+	signsA_perm = copySignPerElem(vmVector3(1.0f),aperm_row * axisA);
+	signsB_perm = copySignPerElem(vmVector3(1.0f),bperm_row * axisB);
 	scalesA_perm = mulPerElem( signsA_perm, halfA_perm );
 	scalesB_perm = mulPerElem( signsB_perm, halfB_perm );
 
@@ -1031,11 +1033,11 @@ boxBoxDistance(
 	// if for some reason no case passes the Voronoi test, the features with the minimum
 	// distance are returned.
 
-	Point3 localPointA_perm, localPointB_perm;
+	vmPoint3 localPointA_perm, localPointB_perm;
 	float minDistSqr;
 	bool done;
 
-	Vector3 hA_perm( halfA_perm ), hB_perm( halfB_perm );
+	vmVector3 hA_perm( halfA_perm ), hB_perm( halfB_perm );
 
 	localPointA_perm.setZ( scalesA_perm.getZ() );
 	localPointB_perm.setZ( scalesB_perm.getZ() );
@@ -1109,9 +1111,11 @@ boxBoxDistance(
 
 	// convert local points from face-local to box-local coordinate system
 
-	boxPointA.localPoint = Point3( aperm_col * Vector3( localPointA_perm ) );
-	boxPointB.localPoint = Point3( bperm_col * Vector3( localPointB_perm ) );
+	
+	boxPointA.localPoint = vmPoint3( aperm_col * vmVector3( localPointA_perm )) ;
+	boxPointB.localPoint = vmPoint3( bperm_col * vmVector3( localPointB_perm )) ;
 
+#if 0
 	// find which features of the boxes are involved.
 	// the only feature pairs which occur in this function are VF, FV, and EE, even though the
 	// closest points might actually lie on sub-features, as in a VF contact might be used for
@@ -1144,6 +1148,7 @@ boxBoxDistance(
 	} else {
 		boxPointB.setVertexFeature( sB[0], sB[1], sB[2] );
 	}
+#endif
 
 	normal = transformA * axisA;
 
diff --git a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h
index c58e257c0..0d4957dea 100644
--- a/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h
+++ b/src/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/boxBoxDistance.h
@@ -21,7 +21,6 @@ subject to the following restrictions:
 
 #include "Box.h"
 
-using namespace Vectormath::Aos;
 
 //---------------------------------------------------------------------------
 // boxBoxDistance:
@@ -38,7 +37,7 @@ using namespace Vectormath::Aos;
 //    positive or negative distance between two boxes.
 //
 // args:
-//    Vector3& normal: set to a unit contact normal pointing from box A to box B.
+//    vmVector3& normal: set to a unit contact normal pointing from box A to box B.
 //
 //    BoxPoint& boxPointA, BoxPoint& boxPointB:
 //       set to a closest point or point of penetration on each box.
@@ -46,7 +45,7 @@ using namespace Vectormath::Aos;
 //    Box boxA, Box boxB:
 //       boxes, represented as 3 half-widths
 //
-//    const Transform3& transformA, const Transform3& transformB:
+//    const vmTransform3& transformA, const vmTransform3& transformB:
 //       box transformations, in world coordinates
 //
 //    float distanceThreshold:
@@ -58,9 +57,9 @@ using namespace Vectormath::Aos;
 //---------------------------------------------------------------------------
 
 float
-boxBoxDistance(Vector3& normal, BoxPoint& boxPointA, BoxPoint& boxPointB,
-			   PE_REF(Box) boxA, const Transform3 & transformA, PE_REF(Box) boxB,
-			   const Transform3 & transformB,
+boxBoxDistance(vmVector3& normal, BoxPoint& boxPointA, BoxPoint& boxPointB,
+			   PE_REF(Box) boxA, const vmTransform3 & transformA, PE_REF(Box) boxB,
+			   const vmTransform3 & transformB,
 			   float distanceThreshold = FLT_MAX );
 
 #endif /* __BOXBOXDISTANCE_H__ */
diff --git a/src/BulletMultiThreaded/TrbDynBody.h b/src/BulletMultiThreaded/TrbDynBody.h
new file mode 100644
index 000000000..d65c22e69
--- /dev/null
+++ b/src/BulletMultiThreaded/TrbDynBody.h
@@ -0,0 +1,78 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef __T_RB_DYN_BODY_H__
+#define __T_RB_DYN_BODY_H__
+
+#include <vectormath_aos.h>
+using namespace Vectormath::Aos;
+
+#include "TrbStateVec.h"
+
+class CollObject;
+
+class TrbDynBody
+{
+public:
+	TrbDynBody()
+	{
+		fMass   = 0.0f;
+		fCollObject = NULL;
+		fElasticity = 0.2f;
+		fFriction = 0.8f;
+	}
+
+	// Get methods
+	float          getMass() const {return fMass;};
+	float          getElasticity() const {return fElasticity;}
+	float          getFriction() const {return fFriction;}
+	CollObject*    getCollObject() const {return fCollObject;}
+	const Matrix3 &getBodyInertia() const {return fIBody;}
+	const Matrix3 &getBodyInertiaInv() const {return fIBodyInv;}
+	float          getMassInv() const {return fMassInv;}
+
+	// Set methods
+	void           setMass(float mass) {fMass=mass;fMassInv=mass>0.0f?1.0f/mass:0.0f;}
+	void           setBodyInertia(const Matrix3 bodyInertia) {fIBody = bodyInertia;fIBodyInv = inverse(bodyInertia);}
+	void           setElasticity(float elasticity) {fElasticity = elasticity;}
+	void           setFriction(float friction) {fFriction = friction;}
+	void           setCollObject(CollObject *collObj) {fCollObject = collObj;}
+	
+	void           setBodyInertiaInv(const Matrix3 bodyInertiaInv) 
+	{
+		fIBody = inverse(bodyInertiaInv);
+		fIBodyInv = bodyInertiaInv;
+	}
+	void           setMassInv(float invMass) {
+		fMass= invMass>0.0f ? 1.0f/invMass :0.0f;
+		fMassInv=invMass;
+	}
+
+
+private:
+	// Rigid Body constants
+	float          fMass;        // Rigid Body mass
+	float          fMassInv;     // Inverse of mass
+	Matrix3        fIBody;       // Inertia matrix in body's coords
+	Matrix3        fIBodyInv;    // Inertia matrix inverse in body's coords
+	float          fElasticity;  // Coefficient of restitution
+	float          fFriction;    // Coefficient of friction
+
+public:
+	CollObject*    fCollObject;  // Collision object corresponding the RB
+} __attribute__ ((aligned(16)));
+
+#endif /* __T_RB_DYN_BODY_H__ */
diff --git a/src/BulletMultiThreaded/TrbStateVec.h b/src/BulletMultiThreaded/TrbStateVec.h
new file mode 100644
index 000000000..8a76a33d0
--- /dev/null
+++ b/src/BulletMultiThreaded/TrbStateVec.h
@@ -0,0 +1,334 @@
+/*
+   Copyright (C) 2009 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+*/
+
+#ifndef __TRBSTATEVEC_H__
+#define __TRBSTATEVEC_H__
+
+#include <stdlib.h>
+#include <vectormath_aos.h>
+
+
+#include "PlatformDefinitions.h"
+
+
+static inline vmVector3 read_Vector3(const float* p)
+{
+	vmVector3 v;
+	loadXYZ(v, p);
+	return v;
+}
+
+static inline vmQuat read_Quat(const float* p)
+{
+	vmQuat vq;
+	loadXYZW(vq, p);
+	return vq;
+}
+
+static inline void store_Vector3(const vmVector3 &src, float* p)
+{
+	vmVector3 v = src;
+	storeXYZ(v, p);
+}
+
+static inline void store_Quat(const vmQuat &src, float* p)
+{
+	vmQuat vq = src;
+	storeXYZW(vq, p);
+}
+
+// Motion Type
+enum {
+	PfxMotionTypeFixed = 0,
+	PfxMotionTypeActive,
+	PfxMotionTypeKeyframe,
+	PfxMotionTypeOneWay,
+	PfxMotionTypeTrigger,
+	PfxMotionTypeCount
+};
+
+#define PFX_MOTION_MASK_DYNAMIC 0x0a // Active,OneWay
+#define PFX_MOTION_MASK_STATIC  0x95 // Fixed,Keyframe,Trigger,Sleeping
+#define PFX_MOTION_MASK_SLEEP   0x0e // Can sleep
+#define PFX_MOTION_MASK_TYPE    0x7f
+
+//
+// Rigid Body state
+//
+
+#ifdef __CELLOS_LV2__
+ATTRIBUTE_ALIGNED128(class) TrbState
+#else
+ATTRIBUTE_ALIGNED16(class) TrbState
+#endif
+
+{
+public:
+	TrbState()
+	{
+		setMotionType(PfxMotionTypeActive);
+		contactFilterSelf=contactFilterTarget=0xffffffff;
+		deleted = 0;
+		mSleeping = 0;
+		useSleep = 1;
+		trbBodyIdx=0;
+		mSleepCount=0;
+		useCcd = 0;
+		useContactCallback = 0;
+		useSleepCallback = 0;
+		linearDamping = 1.0f;
+		angularDamping = 0.99f;
+	}
+
+	TrbState(const uint8_t m, const vmVector3& x, const vmQuat& q, const vmVector3& v, const vmVector3& omega );
+	
+	uint16_t	mSleepCount;
+	uint8_t		mMotionType;
+	uint8_t		deleted            : 1;
+	uint8_t		mSleeping           : 1;
+	uint8_t		useSleep           : 1;
+	uint8_t		useCcd		       : 1;
+	uint8_t		useContactCallback : 1;
+	uint8_t		useSleepCallback   : 1;
+
+	uint16_t	trbBodyIdx;
+	uint32_t	contactFilterSelf;
+	uint32_t	contactFilterTarget;
+
+	float		center[3];		// AABB center(World)
+	float		half[3];		// AABB half(World)
+
+	float		linearDamping;
+	float		angularDamping;
+	
+	float		deltaLinearVelocity[3];
+	float		deltaAngularVelocity[3];
+
+	float     fX[3];				// position
+	float     fQ[4];				// orientation
+	float     fV[3];				// velocity
+	float     fOmega[3];			// angular velocity
+
+	inline void setZero();      // Zeroes out the elements
+	inline void setIdentity();  // Sets the rotation to identity and zeroes out the other elements
+
+	bool		isDeleted() const {return deleted==1;}
+
+	uint16_t	getRigidBodyId() const {return trbBodyIdx;}
+	void		setRigidBodyId(uint16_t i) {trbBodyIdx = i;}
+
+
+	uint32_t	getContactFilterSelf() const {return contactFilterSelf;}
+	void		setContactFilterSelf(uint32_t filter) {contactFilterSelf = filter;}
+
+	uint32_t	getContactFilterTarget() const {return contactFilterTarget;}
+	void		setContactFilterTarget(uint32_t filter) {contactFilterTarget = filter;}
+
+	float getLinearDamping() const {return linearDamping;}
+	float getAngularDamping() const {return angularDamping;}
+
+	void setLinearDamping(float damping) {linearDamping=damping;}
+	void setAngularDamping(float damping) {angularDamping=damping;}
+
+
+	uint8_t		getMotionType() const {return mMotionType;}
+	void		setMotionType(uint8_t t) {mMotionType = t;mSleeping=0;mSleepCount=0;}
+
+	uint8_t		getMotionMask() const {return (1<<mMotionType)|(mSleeping<<7);}
+
+	bool		isAsleep() const {return mSleeping==1;}
+	bool		isAwake() const {return mSleeping==0;}
+
+	void		wakeup() {mSleeping=0;mSleepCount=0;}
+	void		sleep() {if(useSleep) {mSleeping=1;mSleepCount=0;}}
+
+	uint8_t		getUseSleep() const {return useSleep;}
+	void		setUseSleep(uint8_t b) {useSleep=b;}
+
+	uint8_t		getUseCcd() const {return useCcd;}
+	void		setUseCcd(uint8_t b) {useCcd=b;}
+
+	uint8_t		getUseContactCallback() const {return useContactCallback;}
+	void		setUseContactCallback(uint8_t b) {useContactCallback=b;}
+
+	uint8_t		getUseSleepCallback() const {return useSleepCallback;}
+	void		setUseSleepCallback(uint8_t b) {useSleepCallback=b;}
+
+	void	 	incrementSleepCount() {mSleepCount++;}
+	void		resetSleepCount() {mSleepCount=0;}
+	uint16_t	getSleepCount() const {return mSleepCount;}
+
+	vmVector3 getPosition() const {return read_Vector3(fX);}
+	vmQuat    getOrientation() const {return read_Quat(fQ);}
+	vmVector3 getLinearVelocity() const {return read_Vector3(fV);}
+	vmVector3 getAngularVelocity() const {return read_Vector3(fOmega);}
+	vmVector3 getDeltaLinearVelocity() const {return read_Vector3(deltaLinearVelocity);}
+	vmVector3 getDeltaAngularVelocity() const {return read_Vector3(deltaAngularVelocity);}
+
+	void setPosition(const vmVector3 &pos) {store_Vector3(pos, fX);}
+	void setLinearVelocity(const vmVector3 &vel) {store_Vector3(vel, fV);}
+	void setAngularVelocity(const vmVector3 &vel) {store_Vector3(vel, fOmega);}
+	void setDeltaLinearVelocity(const vmVector3 &vel) {store_Vector3(vel, deltaLinearVelocity);}
+	void setDeltaAngularVelocity(const vmVector3 &vel) {store_Vector3(vel, deltaAngularVelocity);}
+	void setOrientation(const vmQuat &rot) {store_Quat(rot, fQ);}
+
+	inline void setAuxils(const vmVector3 &centerLocal,const vmVector3 &halfLocal);
+	inline void	setAuxilsCcd(const vmVector3 &centerLocal,const vmVector3 &halfLocal,float timeStep);
+	inline	void reset();
+};
+
+inline
+TrbState::TrbState(const uint8_t m, const vmVector3& x, const vmQuat& q, const vmVector3& v, const vmVector3& omega)
+{
+	setMotionType(m);
+	fX[0] = x[0];
+	fX[1] = x[1];
+	fX[2] = x[2];
+	fQ[0] = q[0];
+	fQ[1] = q[1];
+	fQ[2] = q[2];
+	fQ[3] = q[3];
+	fV[0] = v[0];
+	fV[1] = v[1];
+	fV[2] = v[2];
+	fOmega[0] = omega[0];
+	fOmega[1] = omega[1];
+	fOmega[2] = omega[2];
+	contactFilterSelf=contactFilterTarget=0xffff;
+	trbBodyIdx=0;
+	mSleeping = 0;
+	deleted = 0;
+	useSleep = 1;
+	useCcd = 0;
+	useContactCallback = 0;
+	useSleepCallback = 0;
+	mSleepCount=0;
+	linearDamping = 1.0f;
+	angularDamping = 0.99f;
+}
+
+inline void
+TrbState::setIdentity()
+{
+	fX[0] = 0.0f;
+	fX[1] = 0.0f;
+	fX[2] = 0.0f;
+	fQ[0] = 0.0f;
+	fQ[1] = 0.0f;
+	fQ[2] = 0.0f;
+	fQ[3] = 1.0f;
+	fV[0] = 0.0f;
+	fV[1] = 0.0f;
+	fV[2] = 0.0f;
+	fOmega[0] = 0.0f;
+	fOmega[1] = 0.0f;
+	fOmega[2] = 0.0f;
+}
+
+inline void
+TrbState::setZero()
+{
+	fX[0] = 0.0f;
+	fX[1] = 0.0f;
+	fX[2] = 0.0f;
+	fQ[0] = 0.0f;
+	fQ[1] = 0.0f;
+	fQ[2] = 0.0f;
+	fQ[3] = 0.0f;
+	fV[0] = 0.0f;
+	fV[1] = 0.0f;
+	fV[2] = 0.0f;
+	fOmega[0] = 0.0f;
+	fOmega[1] = 0.0f;
+	fOmega[2] = 0.0f;
+}
+
+inline void
+TrbState::setAuxils(const vmVector3 &centerLocal,const vmVector3 &halfLocal)
+{
+	vmVector3 centerW = getPosition() + rotate(getOrientation(),centerLocal);
+	vmVector3 halfW = absPerElem(vmMatrix3(getOrientation())) * halfLocal;
+	center[0] = centerW[0];
+	center[1] = centerW[1];
+	center[2] = centerW[2];
+	half[0] = halfW[0];
+	half[1] = halfW[1];
+	half[2] = halfW[2];
+}
+
+inline void
+TrbState::setAuxilsCcd(const vmVector3 &centerLocal,const vmVector3 &halfLocal,float timeStep)
+{
+	vmVector3 centerW = getPosition() + rotate(getOrientation(),centerLocal);
+	vmVector3 halfW = absPerElem(vmMatrix3(getOrientation())) * halfLocal;
+
+	vmVector3 diffvec = getLinearVelocity()*timeStep;
+
+	vmVector3 newCenter = centerW + diffvec;
+	vmVector3 aabbMin = minPerElem(newCenter - halfW,centerW - halfW);
+	vmVector3 aabbMax = maxPerElem(newCenter + halfW,centerW + halfW);
+	
+	centerW = 0.5f * (aabbMin + aabbMax);
+	halfW =0.5f * (aabbMax - aabbMin);
+
+	center[0] = centerW[0];
+	center[1] = centerW[1];
+	center[2] = centerW[2];
+
+	half[0] = halfW[0];
+	half[1] = halfW[1];
+	half[2] = halfW[2];
+}
+
+inline
+void TrbState::reset()
+{
+#if 0
+	mSleepCount = 0;
+	mMotionType = PfxMotionTypeActive;
+	mDeleted = 0;
+	mSleeping = 0;
+	mUseSleep = 1;
+	mUseCcd = 0;
+	mUseContactCallback = 0;
+	mUseSleepCallback = 0;
+	mRigidBodyId = 0;
+	mContactFilterSelf = 0xffffffff;
+	mContactFilterTarget = 0xffffffff;
+	mLinearDamping = 1.0f;
+	mAngularDamping = 0.99f;
+	mPosition = vmVector3(0.0f);
+	mOrientation = vmQuat::identity();
+	mLinearVelocity = vmVector3(0.0f);
+	mAngularVelocity = vmVector3(0.0f);
+#endif
+
+	setMotionType(PfxMotionTypeActive);
+	contactFilterSelf=contactFilterTarget=0xffffffff;
+	deleted = 0;
+	mSleeping = 0;
+	useSleep = 1;
+	trbBodyIdx=0;
+	mSleepCount=0;
+	useCcd = 0;
+	useContactCallback = 0;
+	useSleepCallback = 0;
+	linearDamping = 1.0f;
+	angularDamping = 0.99f;
+}
+
+#endif /* __TRBSTATEVEC_H__ */
+
diff --git a/src/BulletMultiThreaded/btParallelConstraintSolver.cpp b/src/BulletMultiThreaded/btParallelConstraintSolver.cpp
index 84774b227..b979398e9 100644
--- a/src/BulletMultiThreaded/btParallelConstraintSolver.cpp
+++ b/src/BulletMultiThreaded/btParallelConstraintSolver.cpp
@@ -14,61 +14,1334 @@ subject to the following restrictions:
 
 */
 
+
 #include "btParallelConstraintSolver.h"
 #include "BulletDynamics/ConstraintSolver/btContactSolverInfo.h"
 
-btParallelConstraintSolver::btParallelConstraintSolver()
+#include "LinearMath/btQuickprof.h"
+#include "BulletMultiThreaded/btThreadSupportInterface.h"
+
+#include "vectormath_aos.h"
+
+
+#include "HeapManager.h"
+
+#include "PlatformDefinitions.h"
+
+//#include "PfxSimdUtils.h"
+#include "LinearMath/btScalar.h"
+
+#include "TrbStateVec.h"
+
+
+
+/////////////////
+
+
+#define TMP_BUFF_BYTES (15*1024*1024)
+unsigned char ATTRIBUTE_ALIGNED128(tmp_buff[TMP_BUFF_BYTES]);
+
+
+
+// Project Gauss Seidel or the equivalent Sequential Impulse
+ inline void resolveSingleConstraintRowGeneric(PfxSolverBody& body1,PfxSolverBody& body2,const btSolverConstraint& c)
 {
 
-	//initialize MiniCL here
+	btScalar deltaImpulse = c.m_rhs-btScalar(c.m_appliedImpulse)*c.m_cfm;
+	const btScalar deltaVel1Dotn	=	c.m_contactNormal.dot((btVector3&)body1.mDeltaLinearVelocity) 	+ c.m_relpos1CrossNormal.dot((btVector3&)body1.mDeltaAngularVelocity);
+	const btScalar deltaVel2Dotn	=	-c.m_contactNormal.dot((btVector3&)body2.mDeltaLinearVelocity) + c.m_relpos2CrossNormal.dot((btVector3&)body2.mDeltaAngularVelocity);
 
+//	const btScalar delta_rel_vel	=	deltaVel1Dotn-deltaVel2Dotn;
+	deltaImpulse	-=	deltaVel1Dotn*c.m_jacDiagABInv;
+	deltaImpulse	-=	deltaVel2Dotn*c.m_jacDiagABInv;
+
+	const btScalar sum = btScalar(c.m_appliedImpulse) + deltaImpulse;
+	if (sum < c.m_lowerLimit)
+	{
+		deltaImpulse = c.m_lowerLimit-c.m_appliedImpulse;
+		c.m_appliedImpulse = c.m_lowerLimit;
+	}
+	else if (sum > c.m_upperLimit) 
+	{
+		deltaImpulse = c.m_upperLimit-c.m_appliedImpulse;
+		c.m_appliedImpulse = c.m_upperLimit;
+	}
+	else
+	{
+		c.m_appliedImpulse = sum;
+	}
+	
+
+	if (body1.mMassInv)
+	{
+		btVector3 linearComponent = c.m_contactNormal*body1.mMassInv;
+		((btVector3&)body1.mDeltaLinearVelocity) += linearComponent*deltaImpulse;
+		((btVector3&)body1.mDeltaAngularVelocity) += c.m_angularComponentA*(btVector3(deltaImpulse,deltaImpulse,deltaImpulse));//*m_angularFactor);
+	}
+
+	if (body2.mMassInv)
+	{
+		btVector3 linearComponent = -c.m_contactNormal*body2.mMassInv;
+		((btVector3&)body2.mDeltaLinearVelocity) += linearComponent*deltaImpulse;
+		((btVector3&)body2.mDeltaAngularVelocity) += c.m_angularComponentB*((btVector3(deltaImpulse,deltaImpulse,deltaImpulse)));//*m_angularFactor);
+	}
+
+	//body1.internalApplyImpulse(c.m_contactNormal*body1.internalGetInvMass(),c.m_angularComponentA,deltaImpulse);
+	//body2.internalApplyImpulse(-c.m_contactNormal*body2.internalGetInvMass(),c.m_angularComponentB,deltaImpulse);
+
+}
+
+ 
+static SIMD_FORCE_INLINE
+void pfxSolveLinearConstraintRow(PfxConstraintRow &constraint,
+	vmVector3 &deltaLinearVelocityA,vmVector3 &deltaAngularVelocityA,
+	float massInvA,const vmMatrix3 &inertiaInvA,const vmVector3 &rA,
+	vmVector3 &deltaLinearVelocityB,vmVector3 &deltaAngularVelocityB,
+	float massInvB,const vmMatrix3 &inertiaInvB,const vmVector3 &rB)
+{
+	const vmVector3 normal(btReadVector3(constraint.mNormal));
+	float deltaImpulse = constraint.mRhs;
+	vmVector3 dVA = deltaLinearVelocityA + cross(deltaAngularVelocityA,rA);
+	vmVector3 dVB = deltaLinearVelocityB + cross(deltaAngularVelocityB,rB);
+	deltaImpulse -= constraint.mJacDiagInv * dot(normal,dVA-dVB);
+	float oldImpulse = constraint.mAccumImpulse;
+	constraint.mAccumImpulse = btClamped(oldImpulse + deltaImpulse,constraint.mLowerLimit,constraint.mUpperLimit);
+	deltaImpulse = constraint.mAccumImpulse - oldImpulse;
+	deltaLinearVelocityA += deltaImpulse * massInvA * normal;
+	deltaAngularVelocityA += deltaImpulse * inertiaInvA * cross(rA,normal);
+	deltaLinearVelocityB -= deltaImpulse * massInvB * normal;
+	deltaAngularVelocityB -= deltaImpulse * inertiaInvB * cross(rB,normal);
+
+}
+ 
+void btSolveContactConstraint(
+	PfxConstraintRow &constraintResponse,
+	PfxConstraintRow &constraintFriction1,
+	PfxConstraintRow &constraintFriction2,
+	const vmVector3 &contactPointA,
+	const vmVector3 &contactPointB,
+	PfxSolverBody &solverBodyA,
+	PfxSolverBody &solverBodyB,
+	float friction
+	)
+{
+	vmVector3 rA = rotate(solverBodyA.mOrientation,contactPointA);
+	vmVector3 rB = rotate(solverBodyB.mOrientation,contactPointB);
+
+	pfxSolveLinearConstraintRow(constraintResponse,
+		solverBodyA.mDeltaLinearVelocity,solverBodyA.mDeltaAngularVelocity,solverBodyA.mMassInv,solverBodyA.mInertiaInv,rA,
+		solverBodyB.mDeltaLinearVelocity,solverBodyB.mDeltaAngularVelocity,solverBodyB.mMassInv,solverBodyB.mInertiaInv,rB);
+
+	float mf = friction*fabsf(constraintResponse.mAccumImpulse);
+	constraintFriction1.mLowerLimit = -mf;
+	constraintFriction1.mUpperLimit =  mf;
+	constraintFriction2.mLowerLimit = -mf;
+	constraintFriction2.mUpperLimit =  mf;
+
+	pfxSolveLinearConstraintRow(constraintFriction1,
+		solverBodyA.mDeltaLinearVelocity,solverBodyA.mDeltaAngularVelocity,solverBodyA.mMassInv,solverBodyA.mInertiaInv,rA,
+		solverBodyB.mDeltaLinearVelocity,solverBodyB.mDeltaAngularVelocity,solverBodyB.mMassInv,solverBodyB.mInertiaInv,rB);
+
+	pfxSolveLinearConstraintRow(constraintFriction2,
+		solverBodyA.mDeltaLinearVelocity,solverBodyA.mDeltaAngularVelocity,solverBodyA.mMassInv,solverBodyA.mInertiaInv,rA,
+		solverBodyB.mDeltaLinearVelocity,solverBodyB.mDeltaAngularVelocity,solverBodyB.mMassInv,solverBodyB.mInertiaInv,rB);
+}
+
+
+void CustomSolveConstraintsTaskParallel(
+	const PfxParallelGroup *contactParallelGroup,const PfxParallelBatch *contactParallelBatches,
+	PfxConstraintPair *contactPairs,uint32_t numContactPairs,
+	btPersistentManifold* offsetContactManifolds__,
+	const PfxParallelGroup *jointParallelGroup,const PfxParallelBatch *jointParallelBatches,
+	PfxConstraintPair *jointPairs,uint32_t numJointPairs,
+	TrbState *offsetRigStates,
+	PfxSolverBody *offsetSolverBodies,
+	uint32_t numRigidBodies,
+	int iteration,unsigned int taskId,unsigned int numTasks,btBarrier *barrier)
+{
+
+	PfxSolverBody staticBody;
+	staticBody.mMassInv = 0.f;
+	staticBody.mDeltaAngularVelocity=vmVector3(0,0,0);
+	staticBody.mDeltaLinearVelocity =vmVector3(0,0,0);
+
+
+	for(int k=0;k<iteration+1;k++) {
+		// Joint
+		for(uint32_t phaseId=0;phaseId<jointParallelGroup->numPhases;phaseId++) {
+			for(uint32_t batchId=0;batchId<jointParallelGroup->numBatches[phaseId];batchId++) {
+				uint32_t numPairs = jointParallelGroup->numPairs[phaseId*PFX_MAX_SOLVER_BATCHES+batchId];
+				if(batchId%numTasks == taskId && numPairs > 0) {
+					const PfxParallelBatch &batch = jointParallelBatches[phaseId*PFX_MAX_SOLVER_BATCHES+batchId];
+					for(int i=0;i<numPairs;i++) {
+						PfxConstraintPair &pair = jointPairs[batch.pairIndices[i]];
+						uint16_t iA = pfxGetRigidBodyIdA(pair);
+						uint16_t iB = pfxGetRigidBodyIdB(pair);
+
+						
+						PfxSolverBody &solverBodyA = iA != 65535 ? offsetSolverBodies[iA] : staticBody;
+						PfxSolverBody &solverBodyB = iB != 65535 ? offsetSolverBodies[iB] : staticBody;
+
+						if(k==0) {
+							
+						}
+						else {
+							btSolverConstraint* constraintRow = (btSolverConstraint*) pfxGetContactId1(pair);
+							int numRows = pfxGetNumConstraints(pair);
+							int i;
+							for (i=0;i<numRows;i++)
+							{
+								resolveSingleConstraintRowGeneric(solverBodyA,solverBodyB,constraintRow[i]);
+							}
+							
+						}
+					}
+				}
+			}
+
+			barrier->sync();
+		}
+
+		// Contact
+		for(uint32_t phaseId=0;phaseId<contactParallelGroup->numPhases;phaseId++) {
+			for(uint32_t batchId=0;batchId<contactParallelGroup->numBatches[phaseId];batchId++) {
+				uint32_t numPairs = contactParallelGroup->numPairs[phaseId*PFX_MAX_SOLVER_BATCHES+batchId];
+				if(batchId%numTasks == taskId && numPairs > 0) {
+					const PfxParallelBatch &batch = contactParallelBatches[phaseId*PFX_MAX_SOLVER_BATCHES+batchId];
+					for(int i=0;i<numPairs;i++) {
+						PfxConstraintPair &pair = contactPairs[batch.pairIndices[i]];
+						uint16_t iA = pfxGetRigidBodyIdA(pair);
+						uint16_t iB = pfxGetRigidBodyIdB(pair);
+
+						btPersistentManifold& contact = *(btPersistentManifold*)pfxGetConstraintId1(pair);
+
+
+						PfxSolverBody &solverBodyA = offsetSolverBodies[iA];
+						PfxSolverBody &solverBodyB = offsetSolverBodies[iB];
+						
+						for(int j=0;j<contact.getNumContacts();j++) {
+							btManifoldPoint& cp = contact.getContactPoint(j);
+							
+							if(k==0) {
+								vmVector3 rA = rotate(solverBodyA.mOrientation,btReadVector3(cp.m_localPointA));
+								vmVector3 rB = rotate(solverBodyB.mOrientation,btReadVector3(cp.m_localPointB));
+								
+								for(int k=0;k<3;k++) {
+									vmVector3 normal = btReadVector3(cp.mConstraintRow[k].mNormal);
+									float deltaImpulse = cp.mConstraintRow[k].mAccumImpulse;
+									solverBodyA.mDeltaLinearVelocity += deltaImpulse * solverBodyA.mMassInv * normal;
+									solverBodyA.mDeltaAngularVelocity += deltaImpulse * solverBodyA.mInertiaInv * cross(rA,normal);
+									solverBodyB.mDeltaLinearVelocity -= deltaImpulse * solverBodyB.mMassInv * normal;
+									solverBodyB.mDeltaAngularVelocity -= deltaImpulse * solverBodyB.mInertiaInv * cross(rB,normal);
+								}
+							}
+							else {
+								btSolveContactConstraint(
+									cp.mConstraintRow[0],
+									cp.mConstraintRow[1],
+									cp.mConstraintRow[2],
+									btReadVector3(cp.m_localPointA),
+									btReadVector3(cp.m_localPointB),
+									solverBodyA,
+									solverBodyB,
+									cp.m_combinedFriction
+									);
+							}
+						}
+					}
+				}
+			}
+
+			if (barrier)
+				barrier->sync();
+		}
+	}
+}
+
+void CustomPostSolverTask(
+	TrbState *states,
+	PfxSolverBody *solverBodies,
+	uint32_t numRigidBodies)
+{
+	for(int i=0;i<numRigidBodies;i++) {
+		TrbState &state = states[i];
+		PfxSolverBody &solverBody = solverBodies[i];
+		state.setLinearVelocity(state.getLinearVelocity()+solverBody.mDeltaLinearVelocity);
+		state.setAngularVelocity(state.getAngularVelocity()+solverBody.mDeltaAngularVelocity);
+	}
+}
+
+void*	SolverlsMemoryFunc()
+{
+	//don't create local store memory, just return 0
+	return 0;
+}
+
+
+static SIMD_FORCE_INLINE
+void pfxGetPlaneSpace(const vmVector3& n, vmVector3& p, vmVector3& q)
+{
+	if(fabsf(n[2]) > 0.707f) {
+		// choose p in y-z plane
+		float a = n[1]*n[1] + n[2]*n[2];
+		float k = 1.0f/sqrtf(a);
+		p[0] = 0;
+		p[1] = -n[2]*k;
+		p[2] = n[1]*k;
+		// set q = n x p
+		q[0] = a*k;
+		q[1] = -n[0]*p[2];
+		q[2] = n[0]*p[1];
+	}
+	else {
+		// choose p in x-y plane
+		float a = n[0]*n[0] + n[1]*n[1];
+		float k = 1.0f/sqrtf(a);
+		p[0] = -n[1]*k;
+		p[1] = n[0]*k;
+		p[2] = 0;
+		// set q = n x p
+		q[0] = -n[2]*p[1];
+		q[1] = n[2]*p[0];
+		q[2] = a*k;
+	}
+}
+
+
+
+#define PFX_CONTACT_SLOP 0.001f
+
+void btSetupContactConstraint(
+	PfxConstraintRow &constraintResponse,
+	PfxConstraintRow &constraintFriction1,
+	PfxConstraintRow &constraintFriction2,
+	float penetrationDepth,
+	float restitution,
+	float friction,
+	const vmVector3 &contactNormal,
+	const vmVector3 &contactPointA,
+	const vmVector3 &contactPointB,
+	const TrbState &stateA,
+	const TrbState &stateB,
+	PfxSolverBody &solverBodyA,
+	PfxSolverBody &solverBodyB,
+	float separateBias,
+	float timeStep
+	)
+{
+	vmVector3 rA = rotate(solverBodyA.mOrientation,contactPointA);
+	vmVector3 rB = rotate(solverBodyB.mOrientation,contactPointB);
+
+	vmMatrix3 K = vmMatrix3::scale(vmVector3(solverBodyA.mMassInv + solverBodyB.mMassInv)) - 
+			crossMatrix(rA) * solverBodyA.mInertiaInv * crossMatrix(rA) - 
+			crossMatrix(rB) * solverBodyB.mInertiaInv * crossMatrix(rB);
+
+	vmVector3 vA = stateA.getLinearVelocity() + cross(stateA.getAngularVelocity(),rA);
+	vmVector3 vB = stateB.getLinearVelocity() + cross(stateB.getAngularVelocity(),rB);
+	vmVector3 vAB = vA-vB;
+
+	vmVector3 tangent1,tangent2;
+	btPlaneSpace1(contactNormal,tangent1,tangent2);
+
+//	constraintResponse.mAccumImpulse = 0.f;
+//	constraintFriction1.mAccumImpulse = 0.f;
+//	constraintFriction2.mAccumImpulse = 0.f;
+
+	// Contact Constraint
+	{
+		vmVector3 normal = contactNormal;
+
+		float denom = dot(K*normal,normal);
+
+		constraintResponse.mRhs = -(1.0f+restitution)*dot(vAB,normal); // velocity error
+		constraintResponse.mRhs -= (separateBias * btMin(0.0f,penetrationDepth+PFX_CONTACT_SLOP)) / timeStep; // position error
+		constraintResponse.mRhs /= denom;
+		constraintResponse.mJacDiagInv = 1.0f/denom;
+		constraintResponse.mLowerLimit = 0.0f;
+		constraintResponse.mUpperLimit = SIMD_INFINITY;
+		btStoreVector3(normal,constraintResponse.mNormal);
+	}
+
+	// Friction Constraint 1
+	{
+		vmVector3 normal = tangent1;
+
+		float denom = dot(K*normal,normal);
+
+		constraintFriction1.mJacDiagInv = 1.0f/denom;
+		constraintFriction1.mRhs = -dot(vAB,normal);
+		constraintFriction1.mRhs *= constraintFriction1.mJacDiagInv;
+		constraintFriction1.mLowerLimit = 0.0f;
+		constraintFriction1.mUpperLimit = SIMD_INFINITY;
+		btStoreVector3(normal,constraintFriction1.mNormal);
+	}
+	
+	// Friction Constraint 2
+	{
+		vmVector3 normal = tangent2;
+
+		float denom = dot(K*normal,normal);
+
+		constraintFriction2.mJacDiagInv = 1.0f/denom;
+		constraintFriction2.mRhs = -dot(vAB,normal);
+		constraintFriction2.mRhs *= constraintFriction2.mJacDiagInv;
+		constraintFriction2.mLowerLimit = 0.0f;
+		constraintFriction2.mUpperLimit = SIMD_INFINITY;
+		btStoreVector3(normal,constraintFriction2.mNormal);
+	}
+}
+
+
+void CustomSetupContactConstraintsTask(
+	PfxConstraintPair *contactPairs,uint32_t numContactPairs,
+	TrbState *offsetRigStates,
+	PfxSolverBody *offsetSolverBodies,
+	uint32_t numRigidBodies,
+	float separateBias,
+	float timeStep)
+{
+	for(int i=0;i<numContactPairs;i++) {
+		PfxConstraintPair &pair = contactPairs[i];
+		if(!pfxGetActive(pair) || pfxGetNumConstraints(pair) == 0 ||
+			((pfxGetMotionMaskA(pair)&PFX_MOTION_MASK_STATIC) && (pfxGetMotionMaskB(pair)&PFX_MOTION_MASK_STATIC)) ) {
+			continue;
+		}
+
+		uint16_t iA = pfxGetRigidBodyIdA(pair);
+		uint16_t iB = pfxGetRigidBodyIdB(pair);
+
+		btPersistentManifold &contact = *(btPersistentManifold*)pfxGetConstraintId1(pair);
+
+
+		TrbState &stateA = offsetRigStates[iA];
+//		PfxRigBody &bodyA = offsetRigBodies[iA];
+		PfxSolverBody &solverBodyA = offsetSolverBodies[iA];
+
+		TrbState &stateB = offsetRigStates[iB];
+//		PfxRigBody &bodyB = offsetRigBodies[iB];
+		PfxSolverBody &solverBodyB = offsetSolverBodies[iB];
+		
+		float restitution = 0.5f * (solverBodyA.restitution + solverBodyB.restitution);
+		//if(contact.getDuration() > 1) restitution = 0.0f;
+		
+		float friction = sqrtf(solverBodyA.friction * solverBodyB.friction);
+
+		for(int j=0;j<contact.getNumContacts();j++) {
+			btManifoldPoint& cp = contact.getContactPoint(j);
+			
+			btSetupContactConstraint(
+				cp.mConstraintRow[0],
+				cp.mConstraintRow[1],
+				cp.mConstraintRow[2],
+				cp.getDistance(),
+				restitution,
+				friction,
+				btReadVector3(cp.m_normalWorldOnB),//.mConstraintRow[0].mNormal),
+				btReadVector3(cp.m_localPointA),
+				btReadVector3(cp.m_localPointB),
+				stateA,
+				stateB,
+				solverBodyA,
+				solverBodyB,
+				separateBias,
+				timeStep
+				);
+		}
+
+		//contact.setCompositeFriction(friction);
+	}
+}
+
+void	SolverThreadFunc(void* userPtr,void* lsMemory)
+{
+	btConstraintSolverIO* io = (btConstraintSolverIO*)(userPtr);//arg->io);
+	btCriticalSection* criticalsection = io->setupContactConstraints.criticalSection;
+	
+
+	//CustomCriticalSection *criticalsection = &io->m_cs;
+	switch(io->cmd) {
+
+		case PFX_CONSTRAINT_SOLVER_CMD_SOLVE_CONSTRAINTS:
+		CustomSolveConstraintsTaskParallel(
+			io->solveConstraints.contactParallelGroup,
+			io->solveConstraints.contactParallelBatches,
+			io->solveConstraints.contactPairs,
+			io->solveConstraints.numContactPairs,
+			io->solveConstraints.offsetContactManifolds,
+
+			io->solveConstraints.jointParallelGroup,
+			io->solveConstraints.jointParallelBatches,
+			io->solveConstraints.jointPairs,
+			io->solveConstraints.numJointPairs,
+
+			io->solveConstraints.offsetRigStates,
+			io->solveConstraints.offsetSolverBodies,
+			io->solveConstraints.numRigidBodies,
+			io->solveConstraints.iteration,
+
+			io->solveConstraints.taskId,
+			io->maxTasks1,
+			io->solveConstraints.barrier
+			);
+		break;
+
+		case PFX_CONSTRAINT_SOLVER_CMD_POST_SOLVER:
+			CustomPostSolverTask(	io->postSolver.states,io->postSolver.solverBodies,	io->postSolver.numRigidBodies);
+			break;
+
+
+		case PFX_CONSTRAINT_SOLVER_CMD_SETUP_CONTACT_CONSTRAINTS:
+		{
+			bool empty = false;
+			while(!empty) {
+				int start,batch;
+				
+				criticalsection->lock();
+
+				start = (int)criticalsection->getSharedParam(0);
+				batch = (int)criticalsection->getSharedParam(1);
+
+				//PFX_PRINTF("taskId %d start %d num %d\n",arg->taskId,start,batch);
+
+				// ���̃o�b�t�@���Z�b�g
+				int nextStart = start + batch;
+				int rest = btMax((int)io->setupContactConstraints.numContactPairs1 - nextStart,0);
+				int nextBatch = (rest > batch)?batch:rest;
+
+				criticalsection->setSharedParam(0,nextStart);
+                criticalsection->setSharedParam(1,nextBatch);
+
+				criticalsection->unlock();
+				
+				if(batch > 0) {
+					CustomSetupContactConstraintsTask(
+						io->setupContactConstraints.offsetContactPairs+start,batch,
+//						io->setupContactConstraints.offsetContactManifolds,
+						io->setupContactConstraints.offsetRigStates,
+//						io->setupContactConstraints.offsetRigBodies,
+						io->setupContactConstraints.offsetSolverBodies,
+						io->setupContactConstraints.numRigidBodies,
+						io->setupContactConstraints.separateBias,
+						io->setupContactConstraints.timeStep);
+				}
+				else {
+					empty = true;
+				}
+			}
+		}
+		break;
+
+		default:
+			{
+				btAssert(0);
+			}
+	}
+
+}
+
+
+void CustomSetupContactConstraintsNew(
+	PfxConstraintPair *contactPairs1,uint32_t numContactPairs,
+	btPersistentManifold *offsetContactManifolds,
+	TrbState *offsetRigStates,
+	PfxSolverBody *offsetSolverBodies,
+	uint32_t numRigidBodies,
+	float separationBias,
+	float timeStep,
+	class btThreadSupportInterface* threadSupport,
+	btCriticalSection* criticalSection,
+	btConstraintSolverIO *io 
+	)
+{
+	int maxTasks = threadSupport->getNumTasks();
+
+	int div = (int)maxTasks * 4;
+	int batch = ((int)numContactPairs + div - 1) / div;
+#ifdef __PPU__
+		BulletPE2ConstraintSolverSpursSupport* spursThread = (BulletPE2ConstraintSolverSpursSupport*) threadSupport;
+#endif
+	if (criticalSection)
+	{
+		criticalSection->setSharedParam(0,0);
+		criticalSection->setSharedParam(1,btMin(batch,64)); // batched number
+	} else
+	{
+#ifdef __PPU__
+		spursThread->setSharedParam(0,0);
+		spursThread->setSharedParam(1,btMin(batch,64)); // batched number
+#endif //__PPU__
+	}
+
+	for(int t=0;t<maxTasks;t++) {
+		io[t].cmd = PFX_CONSTRAINT_SOLVER_CMD_SETUP_CONTACT_CONSTRAINTS;
+		io[t].setupContactConstraints.offsetContactPairs = contactPairs1;
+		io[t].setupContactConstraints.numContactPairs1 = numContactPairs;
+		io[t].setupContactConstraints.offsetRigStates = offsetRigStates;
+		io[t].setupContactConstraints.offsetSolverBodies = offsetSolverBodies;
+		io[t].setupContactConstraints.numRigidBodies = numRigidBodies;
+		io[t].setupContactConstraints.separateBias = separationBias;
+		io[t].setupContactConstraints.timeStep = timeStep;
+		io[t].setupContactConstraints.criticalSection = criticalSection;
+		io[t].maxTasks1 = maxTasks;
+#ifdef __PPU__
+		io[t].barrierAddr2 = (unsigned int)spursThread->getBarrierAddress();
+		io[t].criticalsectionAddr2 = (unsigned int)spursThread->getCriticalSectionAddress();
+#endif
+	
+
+//#define SEQUENTIAL_SETUP
+#ifdef SEQUENTIAL_SETUP
+		CustomSetupContactConstraintsTask(contactPairs1,numContactPairs,offsetRigStates,offsetSolverBodies,numRigidBodies,separationBias,timeStep);
+#else
+		threadSupport->sendRequest(1,(ppu_address_t)&io[t],t);
+#endif
+
+	}
+#ifndef SEQUENTIAL_SETUP
+	unsigned int arg0,arg1;
+	for(int t=0;t<maxTasks;t++) {
+		arg0 = t;
+		threadSupport->waitForResponse(&arg0,&arg1);
+	}
+#endif //SEQUENTIAL_SETUP
+
+}
+
+
+void CustomSplitConstraints(
+	PfxConstraintPair *pairs,uint32_t numPairs,
+	PfxParallelGroup &group,PfxParallelBatch *batches,
+	uint32_t numTasks,
+	uint32_t numRigidBodies,
+	void *poolBuff,
+	uint32_t poolBytes
+	)
+{
+	HeapManager pool((unsigned char*)poolBuff,poolBytes);
+
+	// �X�e�[�g�`�F�b�N�p�r�b�g�t���O�e�[�u��
+	int bufSize = sizeof(uint8_t)*numRigidBodies;
+	bufSize = ((bufSize+127)>>7)<<7; // 128 bytes alignment
+	uint8_t *bodyTable = (uint8_t*)pool.allocate(bufSize,HeapManager::ALIGN128);
+
+	// �y�A�`�F�b�N�p�r�b�g�t���O�e�[�u��
+	uint32_t *pairTable;
+	size_t allocSize = sizeof(uint32_t)*((numPairs+31)/32);
+	pairTable = (uint32_t*)pool.allocate(allocSize);
+	memset(pairTable,0,allocSize);
+
+	// �ڕW�Ƃ��镪����
+	uint32_t targetCount = btMax(uint32_t(PFX_MIN_SOLVER_PAIRS),btMin(numPairs / (numTasks*2),uint32_t(PFX_MAX_SOLVER_PAIRS)));
+	uint32_t startIndex = 0;
+	
+	uint32_t phaseId;
+	uint32_t batchId;
+	uint32_t totalCount=0;
+	
+	uint32_t maxBatches = btMin(numTasks,uint32_t(PFX_MAX_SOLVER_BATCHES));
+	
+	for(phaseId=0;phaseId<PFX_MAX_SOLVER_PHASES&&totalCount<numPairs;phaseId++) {
+		bool startIndexCheck = true;
+		
+		group.numBatches[phaseId] = 0;
+		
+		uint32_t i = startIndex;
+		
+        // �`�F�b�N�p�r�b�g�t���O�e�[�u�����N���A
+		memset(bodyTable,0xff,bufSize);
+		
+		for(batchId=0;i<numPairs&&totalCount<numPairs&&batchId<maxBatches;batchId++) {
+			uint32_t pairCount=0;
+			
+			PfxParallelBatch &batch = batches[phaseId*PFX_MAX_SOLVER_BATCHES+batchId];
+			uint32_t pairId = 0;
+			
+			for(;i<numPairs&&pairCount<targetCount;i++) {
+				uint32_t idxP = i>>5;
+				uint32_t maskP = 1L << (i & 31);
+				
+				//pair is already assigned to a phase/batch
+				if(pairTable[idxP] & maskP) {
+					continue;
+				}
+				
+				uint32_t idxA = pfxGetRigidBodyIdA(pairs[i]);
+				uint32_t idxB = pfxGetRigidBodyIdB(pairs[i]);
+
+				// �����Ƃ��A�N�e�B�u�łȂ��A�܂��͏Փ˓_���O�̃y�A�͓o�^�Ώۂ���͂���
+				if(!pfxGetActive(pairs[i]) || pfxGetNumConstraints(pairs[i]) == 0 ||
+					((pfxGetMotionMaskA(pairs[i])&PFX_MOTION_MASK_STATIC) && (pfxGetMotionMaskB(pairs[i])&PFX_MOTION_MASK_STATIC)) ) {
+					if(startIndexCheck) 
+						startIndex++;
+					//assign pair -> skip it because it has no constraints
+					pairTable[idxP] |= maskP;
+					totalCount++;
+					continue;
+				}
+				
+				// �ˑ����̃`�F�b�N
+				if( (bodyTable[idxA] != batchId && bodyTable[idxA] != 0xff) || 
+					(bodyTable[idxB] != batchId && bodyTable[idxB] != 0xff) ) {
+					startIndexCheck = false;
+					//bodies of the pair are already assigned to another batch within this phase
+					continue;
+				}
+				
+				// �ˑ�������e�[�u���ɓo�^
+				if(pfxGetMotionMaskA(pairs[i])&PFX_MOTION_MASK_DYNAMIC) 
+						bodyTable[idxA] = batchId;
+				if(pfxGetMotionMaskB(pairs[i])&PFX_MOTION_MASK_DYNAMIC) 
+						bodyTable[idxB] = batchId;
+				
+				if(startIndexCheck) 
+					startIndex++;
+				
+				pairTable[idxP] |= maskP;
+				//add the pair 'i' to the current batch
+				batch.pairIndices[pairId++] = i;
+				pairCount++;
+			}
+
+			group.numPairs[phaseId*PFX_MAX_SOLVER_BATCHES+batchId] = (uint16_t)pairId;
+			totalCount += pairCount;
+		}
+
+		group.numBatches[phaseId] = batchId;
+	}
+
+	group.numPhases = phaseId;
+
+	pool.clear();
+}
+
+
+
+void CustomSolveConstraintsParallel(
+	PfxConstraintPair *contactPairs,uint32_t numContactPairs,
+	
+	PfxConstraintPair *jointPairs,uint32_t numJointPairs,
+
+	TrbState *offsetRigStates,
+	PfxSolverBody *offsetSolverBodies,
+	uint32_t numRigidBodies,
+	struct btConstraintSolverIO* io,
+	class btThreadSupportInterface* threadSupport,
+	int iteration,
+	void* poolBuf,
+	int poolBytes,
+	class btBarrier* barrier)
+	{
+
+	int maxTasks = threadSupport->getNumTasks();
+//	config.taskManager->setTaskEntry(PFX_SOLVER_ENTRY);
+
+	HeapManager pool((unsigned char*)poolBuf,poolBytes);
+
+	{
+		PfxParallelGroup *cgroup = (PfxParallelGroup*)pool.allocate(sizeof(PfxParallelGroup));
+		PfxParallelBatch *cbatches = (PfxParallelBatch*)pool.allocate(sizeof(PfxParallelBatch)*(PFX_MAX_SOLVER_PHASES*PFX_MAX_SOLVER_BATCHES),128);
+		PfxParallelGroup *jgroup = (PfxParallelGroup*)pool.allocate(sizeof(PfxParallelGroup));
+		PfxParallelBatch *jbatches = (PfxParallelBatch*)pool.allocate(sizeof(PfxParallelBatch)*(PFX_MAX_SOLVER_PHASES*PFX_MAX_SOLVER_BATCHES),128);
+		
+		uint32_t tmpBytes = poolBytes - 2 * (sizeof(PfxParallelGroup) + sizeof(PfxParallelBatch)*(PFX_MAX_SOLVER_PHASES*PFX_MAX_SOLVER_BATCHES) + 128);
+		void *tmpBuff = pool.allocate(tmpBytes);
+		
+		CustomSplitConstraints(contactPairs,numContactPairs,*cgroup,cbatches,maxTasks,numRigidBodies,tmpBuff,tmpBytes);
+		CustomSplitConstraints(jointPairs,numJointPairs,*jgroup,jbatches,maxTasks,numRigidBodies,tmpBuff,tmpBytes);
+
+//#define SOLVE_SEQUENTIAL
+#ifdef SOLVE_SEQUENTIAL
+		CustomSolveConstraintsTask(
+			io->solveConstraints.contactParallelGroup,
+			io->solveConstraints.contactParallelBatches,
+			io->solveConstraints.contactPairs,
+			io->solveConstraints.numContactPairs,
+			io->solveConstraints.offsetContactManifolds,
+
+			io->solveConstraints.jointParallelGroup,
+			io->solveConstraints.jointParallelBatches,
+			io->solveConstraints.jointPairs,
+			io->solveConstraints.numJointPairs,
+			io->solveConstraints.offsetJoints,
+
+			io->solveConstraints.offsetRigStates,
+			io->solveConstraints.offsetSolverBodies,
+			io->solveConstraints.numRigidBodies,
+			io->solveConstraints.iteration,0,1,0);//arg->taskId,1,0);//,arg->maxTasks,arg->barrier);
+#else
+		for(int t=0;t<maxTasks;t++) {
+			io[t].cmd = PFX_CONSTRAINT_SOLVER_CMD_SOLVE_CONSTRAINTS;
+			io[t].solveConstraints.contactParallelGroup = cgroup;
+			io[t].solveConstraints.contactParallelBatches = cbatches;
+			io[t].solveConstraints.contactPairs = contactPairs;
+			io[t].solveConstraints.numContactPairs = numContactPairs;
+//			io[t].solveConstraints.offsetContactManifolds = offsetContactManifolds;
+			io[t].solveConstraints.jointParallelGroup = jgroup;
+			io[t].solveConstraints.jointParallelBatches = jbatches;
+			io[t].solveConstraints.jointPairs = jointPairs;
+			io[t].solveConstraints.numJointPairs = numJointPairs;
+			io[t].solveConstraints.offsetRigStates = offsetRigStates;
+			io[t].solveConstraints.offsetSolverBodies = offsetSolverBodies;
+			io[t].solveConstraints.numRigidBodies = numRigidBodies;
+			io[t].solveConstraints.iteration = iteration;
+			io[t].solveConstraints.taskId = t;
+			io[t].solveConstraints.barrier = barrier;
+
+		io[t].maxTasks1 = maxTasks;
+#ifdef __PPU__
+		BulletPE2ConstraintSolverSpursSupport* spursThread = (BulletPE2ConstraintSolverSpursSupport*) threadSupport;
+		io[t].barrierAddr2 = (unsigned int) spursThread->getBarrierAddress();
+		io[t].criticalsectionAddr2 = (unsigned int)spursThread->getCriticalSectionAddress();
+#endif
+
+			threadSupport->sendRequest(1,(ppu_address_t)&io[t],t);
+		}
+
+		unsigned int arg0,arg1;
+		for(int t=0;t<maxTasks;t++) {
+			arg0 = t;
+			threadSupport->waitForResponse(&arg0,&arg1);
+		}
+#endif
+		pool.clear();
+	}
+
+	{
+		int batch = ((int)numRigidBodies + maxTasks - 1) / maxTasks;
+		int rest = (int)numRigidBodies;
+		int start = 0;
+
+		for(int t=0;t<maxTasks;t++) {
+			int num = (rest - batch ) > 0 ? batch : rest;
+			io[t].cmd = PFX_CONSTRAINT_SOLVER_CMD_POST_SOLVER;
+			io[t].postSolver.states = offsetRigStates + start;
+			io[t].postSolver.solverBodies = offsetSolverBodies + start;
+			io[t].postSolver.numRigidBodies = (uint32_t)num;
+		io[t].maxTasks1 = maxTasks;
+#ifdef __PPU__
+		BulletPE2ConstraintSolverSpursSupport* spursThread = (BulletPE2ConstraintSolverSpursSupport*) threadSupport;
+		io[t].barrierAddr2 = (unsigned int)spursThread->getBarrierAddress();
+		io[t].criticalsectionAddr2 = (unsigned int)spursThread->getCriticalSectionAddress();
+#endif
+
+#ifdef SOLVE_SEQUENTIAL
+			CustomPostSolverTask(	io[t].postSolver.states,io[t].postSolver.solverBodies,	io[t].postSolver.numRigidBodies);
+#else
+			threadSupport->sendRequest(1,(ppu_address_t)&io[t],t);
+#endif
+			rest -= num;
+			start += num;
+		}
+
+		unsigned int arg0,arg1;
+		for(int t=0;t<maxTasks;t++) {
+#ifndef SOLVE_SEQUENTIAL
+			arg0 = t;
+			threadSupport->waitForResponse(&arg0,&arg1);
+#endif
+		}
+	}
+
+}
+
+
+
+void BPE_customConstraintSolverSequentialNew(unsigned int new_num, PfxBroadphasePair *new_pairs1 ,
+									  TrbState* states,int numRigidBodies, 
+									  struct PfxSolverBody* solverBodies, 
+									  btPersistentManifold* contacts,
+									  PfxConstraintPair* jointPairs, unsigned int numJoints,
+									  float separateBias,
+									  float timeStep,
+									  int iteration,
+									  btThreadSupportInterface* solverThreadSupport,
+									  btCriticalSection* criticalSection,
+									  struct btConstraintSolverIO* solverIO,
+									  btBarrier* barrier
+									  )
+{
+
+	{
+		BT_PROFILE("pfxSetupConstraints");
+
+		for(int i=0;i<numJoints;i++) {
+			// ���̍X�V
+			PfxConstraintPair &pair = jointPairs[i];
+			int idA = pfxGetRigidBodyIdA(pair);
+
+			if (idA != 65535)
+			{
+				pfxSetMotionMaskA(pair,states[pfxGetRigidBodyIdA(pair)].getMotionMask());
+			}
+			else
+			{
+				pfxSetMotionMaskA(pair,PFX_MOTION_MASK_STATIC);
+			}
+			int idB = pfxGetRigidBodyIdB(pair);
+			if (idB!= 65535)
+			{
+				pfxSetMotionMaskB(pair,states[pfxGetRigidBodyIdB(pair)].getMotionMask());
+			} else
+			{
+				pfxSetMotionMaskB(pair,PFX_MOTION_MASK_STATIC);
+			}
+		}
+
+//		CustomSetupJointConstraintsSeq(			jointPairs,numJoints,joints,			states,			solverBodies,			numRigidBodies,			timeStep);
+
+#ifdef SEQUENTIAL_SETUP
+		CustomSetupContactConstraintsSeqNew(
+			(PfxConstraintPair*)new_pairs1,new_num,contacts,
+			states,
+			solverBodies,
+			numRigidBodies,
+			separateBias,
+			timeStep);
+#else
+		CustomSetupContactConstraintsNew(
+			(PfxConstraintPair*)new_pairs1,new_num,contacts,
+			states,
+			solverBodies,
+			numRigidBodies,
+			separateBias,
+			timeStep,
+			solverThreadSupport,
+			criticalSection,solverIO
+			);
+
+#endif //SEQUENTIAL_SETUP
+
+	}
+	{
+		BT_PROFILE("pfxSolveConstraints");
+
+//#define SEQUENTIAL
+#ifdef SEQUENTIAL
+		CustomSolveConstraintsSeq(
+			(PfxConstraintPair*)new_pairs1,new_num,contacts,
+			jointPairs,numJoints,
+			states,
+			solverBodies,
+			numRigidBodies,
+			separateBias,
+			timeStep,
+			iteration);
+#else //SEQUENTIAL
+		CustomSolveConstraintsParallel(
+			(PfxConstraintPair*)new_pairs1,new_num,
+			jointPairs,numJoints,
+			states,
+			solverBodies,
+			numRigidBodies,
+			solverIO, solverThreadSupport,
+			iteration,
+			tmp_buff,
+			TMP_BUFF_BYTES,
+			barrier
+			);
+
+#endif //SEQUENTIAL
+	}
+
+}
+
+
+struct	btParallelSolverMemoryCache
+{
+	btAlignedObjectArray<TrbState>	m_mystates;
+	btAlignedObjectArray<PfxSolverBody>  m_mysolverbodies;
+	btAlignedObjectArray<PfxBroadphasePair> m_mypairs;
+	btAlignedObjectArray<PfxConstraintPair> m_jointPairs;
+	
+};
+
+btConstraintSolverIO* createSolverIO(int numThreads)
+{
+	return new btConstraintSolverIO[numThreads];
+}
+
+btParallelConstraintSolver::btParallelConstraintSolver(btThreadSupportInterface* solverThreadSupport)
+{
+	
+	m_solverThreadSupport = solverThreadSupport;//createSolverThreadSupport(maxNumThreads);
+	m_solverIO = createSolverIO(m_solverThreadSupport->getNumTasks());
+
+	m_barrier = m_solverThreadSupport->createBarrier();
+	m_criticalSection = m_solverThreadSupport->createCriticalSection();
+
+	m_memoryCache = new btParallelSolverMemoryCache();
 }
 	
 btParallelConstraintSolver::~btParallelConstraintSolver()
 {
-	//exit MiniCL
-
+	delete m_memoryCache;
 }
 
-	
-btScalar btParallelConstraintSolver::solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc)
+
+
+btScalar btParallelConstraintSolver::solveGroup(btCollisionObject** bodies1,int numRigidBodies,btPersistentManifold** manifoldPtr,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher)
 {
-	{
-			int i;
-			btPersistentManifold* manifold = 0;
-//			btCollisionObject* colObj0=0,*colObj1=0;
-
-
-			for (i=0;i<numManifolds;i++)
-			{
-				manifold = manifoldPtr[i];
-				convertContact(manifold,infoGlobal);
-			}
+	
+	int sz = sizeof(PfxSolverBody);
+	int sz2 = sizeof(vmVector3);
+	int sz3 = sizeof(vmMatrix3);
+	int sz4 = sizeof(vmQuat);
+	int sz5 = sizeof(PfxConstraintRow);
+	int sz6 = sizeof(btSolverConstraint);
+	int sz7 = sizeof(TrbState);
+	
 		
-	}
+	m_memoryCache->m_mysolverbodies.resize(numRigidBodies);
+	m_memoryCache->m_mystates.resize(numRigidBodies);
 
-	btContactSolverInfo info = infoGlobal;
-
-
-
-	int numConstraintPool = m_tmpSolverContactConstraintPool.size();
-	int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size();
-
-	///@todo: use stack allocator for such temporarily memory, same for solver bodies/constraints
-	m_orderTmpConstraintPool.resize(numConstraintPool);
-	m_orderFrictionConstraintPool.resize(numFrictionPool);
 	{
-		int i;
-		for (i=0;i<numConstraintPool;i++)
+			BT_PROFILE("create states and solver bodies");
+	for (int i=0;i<numRigidBodies;i++)
+	{
+		btCollisionObject* obj = bodies1[i];
+		obj->setCompanionId(i);
+
+		PfxSolverBody& solverBody = m_memoryCache->m_mysolverbodies[i];
+		btRigidBody* rb = btRigidBody::upcast(obj);
+		TrbState& state = m_memoryCache->m_mystates[i];
+	
+		state.reset();
+		const btQuaternion& orgOri = obj->getWorldTransform().getRotation();
+		vmQuat orn(orgOri.getX(),orgOri.getY(),orgOri.getZ(),orgOri.getW());
+		state.setPosition((vmVector3&) obj->getWorldTransform().getOrigin());
+		state.setOrientation(orn);
+		state.setPosition(state.getPosition());
+		state.setRigidBodyId(i);
+		state.setAngularDamping(0);
+		state.setLinearDamping(0);
+		
+		
+		solverBody.mOrientation = state.getOrientation();
+		solverBody.mDeltaLinearVelocity = vmVector3(0.0f);
+		solverBody.mDeltaAngularVelocity = vmVector3(0.0f);
+		solverBody.friction = obj->getFriction();
+		solverBody.restitution = obj->getRestitution();
+		
+		state.resetSleepCount();
+		
+		//if(state.getMotionMask()&PFX_MOTION_MASK_DYNAMIC) {
+		if (rb && (rb->getInvMass()>0.f))
 		{
-			m_orderTmpConstraintPool[i] = i;
+			state.setAngularVelocity(vmVector3(rb->getAngularVelocity().getX(),rb->getAngularVelocity().getY(),rb->getAngularVelocity().getZ()));
+			state.setLinearVelocity(vmVector3(rb->getLinearVelocity().getX(),rb->getLinearVelocity().getY(),rb->getLinearVelocity().getZ()));
+	
+			state.setMotionType(PfxMotionTypeActive);
+			vmMatrix3 ori(solverBody.mOrientation);
+			vmMatrix3 localInvInertia = vmMatrix3::identity();
+			localInvInertia.setCol(0,vmVector3(rb->getInvInertiaDiagLocal().getX(),0,0));
+			localInvInertia.setCol(1,vmVector3(0, rb->getInvInertiaDiagLocal().getY(),0));
+			localInvInertia.setCol(2,vmVector3(0,0, rb->getInvInertiaDiagLocal().getZ()));
+
+			solverBody.mMassInv = rb->getInvMass();
+			solverBody.mInertiaInv = ori * localInvInertia * transpose(ori);
+		} else
+		{
+			state.setAngularVelocity(vmVector3(0));
+			state.setLinearVelocity(vmVector3(0));
+		
+			state.setMotionType(PfxMotionTypeFixed);
+			m_memoryCache->m_mysolverbodies[i].mMassInv = 0.f;
+			m_memoryCache->m_mysolverbodies[i].mInertiaInv = vmMatrix3(0.0f);
 		}
-		for (i=0;i<numFrictionPool;i++)
+
+	}
+	}
+
+
+
+	int totalPoints = 0;
+#ifndef USE_C_ARRAYS
+	m_memoryCache->m_mypairs.resize(numManifolds);
+	m_memoryCache->m_jointPairs.resize(numConstraints);
+#endif//USE_C_ARRAYS
+
+	int actualNumManifolds= 0;
+	{
+		BT_PROFILE("convert manifolds");
+		for (int i1=0;i1<numManifolds;i1++)
 		{
-			m_orderFrictionConstraintPool[i] = i;
+			if (manifoldPtr[i1]->getNumContacts()>0)
+			{
+				btPersistentManifold* m = manifoldPtr[i1];
+				btCollisionObject* obA = (btCollisionObject*)m->getBody0();
+				btCollisionObject* obB = (btCollisionObject*)m->getBody1();
+				bool obAisActive = !obA->isStaticOrKinematicObject() && obA->isActive();
+				bool obBisActive = !obB->isStaticOrKinematicObject() && obB->isActive();
+
+				if (!obAisActive && !obBisActive)
+					continue;
+
+
+				//int contactId = i1;//actualNumManifolds;
+				
+				PfxBroadphasePair& pair = m_memoryCache->m_mypairs[actualNumManifolds];
+				//init those
+				float compFric = obA->getFriction()*obB->getFriction();//@todo
+				int idA = obA->getCompanionId();
+				int idB = obB->getCompanionId();
+				
+				m->m_companionIdA = idA;
+				m->m_companionIdB = idB;
+				
+				
+			//	if ((mysolverbodies[idA].mMassInv!=0)&&(mysolverbodies[idB].mMassInv!=0))
+			//		continue;
+				int numPosPoints=0;
+				for (int p=0;p<m->getNumContacts();p++)
+				{
+					btManifoldPoint& pt = m->getContactPoint(p);
+					float dist = pt.getDistance();
+					//if (dist<0.001)
+						numPosPoints++;
+				}
+
+				
+				numPosPoints = numPosPoints;
+				totalPoints+=numPosPoints;
+				pfxSetRigidBodyIdA(pair,idA);
+				pfxSetRigidBodyIdB(pair,idB);
+				pfxSetMotionMaskA(pair,m_memoryCache->m_mystates[idA].getMotionMask());
+				pfxSetMotionMaskB(pair,m_memoryCache->m_mystates[idB].getMotionMask());
+				pfxSetActive(pair,numPosPoints>0);
+				
+				pfxSetBroadphaseFlag(pair,0);
+				pfxSetContactId(pair,(uint64_t)m);//contactId);
+				pfxSetNumConstraints(pair,numPosPoints);//manifoldPtr[i]->getNumContacts());
+				actualNumManifolds++;
+			}
+
 		}
 	}
 
+	PfxConstraintPair* jointPairs=0;
+	jointPairs = numConstraints? &m_memoryCache->m_jointPairs[0]:0;
+	int actualNumJoints=0;
+
+
+	//if (1)
+	{
+		BT_PROFILE("convert constraints");
+		{
+
+			int totalNumRows = 0;
+			int i;
+			
+			m_tmpConstraintSizesPool.resize(numConstraints);
+			//calculate the total number of contraint rows
+			for (i=0;i<numConstraints;i++)
+			{
+				btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+				constraints[i]->getInfo1(&info1);
+				totalNumRows += info1.m_numConstraintRows;
+			}
+			m_tmpSolverNonContactConstraintPool.resize(totalNumRows);
+
+			
+			///setup the btSolverConstraints
+			int currentRow = 0;
+
+			for (i=0;i<numConstraints;i++)
+			{
+				const btTypedConstraint::btConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+				
+				if (info1.m_numConstraintRows)
+				{
+					btAssert(currentRow<totalNumRows);
+
+					btSolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[currentRow];
+					btTypedConstraint* constraint = constraints[i];
+
+
+
+					btRigidBody& rbA = constraint->getRigidBodyA();
+					btRigidBody& rbB = constraint->getRigidBodyB();
+
+					
+					int j;
+					for ( j=0;j<info1.m_numConstraintRows;j++)
+					{
+						memset(&currentConstraintRow[j],0,sizeof(btSolverConstraint));
+						currentConstraintRow[j].m_lowerLimit = -FLT_MAX;
+						currentConstraintRow[j].m_upperLimit = FLT_MAX;
+						currentConstraintRow[j].m_appliedImpulse = 0.f;
+						currentConstraintRow[j].m_appliedPushImpulse = 0.f;
+						currentConstraintRow[j].m_solverBodyA = &rbA;
+						currentConstraintRow[j].m_solverBodyB = &rbB;
+					}
+
+					rbA.internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
+					rbA.internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
+					rbB.internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f);
+					rbB.internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f);
+
+
+
+					btTypedConstraint::btConstraintInfo2 info2;
+					info2.fps = 1.f/infoGlobal.m_timeStep;
+					info2.erp = infoGlobal.m_erp;
+					info2.m_J1linearAxis = currentConstraintRow->m_contactNormal;
+					info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal;
+					info2.m_J2linearAxis = 0;
+					info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal;
+					info2.rowskip = sizeof(btSolverConstraint)/sizeof(btScalar);//check this
+					///the size of btSolverConstraint needs be a multiple of btScalar
+					btAssert(info2.rowskip*sizeof(btScalar)== sizeof(btSolverConstraint));
+					info2.m_constraintError = &currentConstraintRow->m_rhs;
+					currentConstraintRow->m_cfm = infoGlobal.m_globalCfm;
+					info2.cfm = &currentConstraintRow->m_cfm;
+					info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;
+					info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
+					info2.m_numIterations = infoGlobal.m_numIterations;
+					constraints[i]->getInfo2(&info2);
+
+					int idA = constraint->getRigidBodyA().getCompanionId();
+					int idB = constraint->getRigidBodyB().getCompanionId();
+				
+
+					///finalize the constraint setup
+					for ( j=0;j<info1.m_numConstraintRows;j++)
+					{
+						btSolverConstraint& solverConstraint = currentConstraintRow[j];
+						solverConstraint.m_originalContactPoint = constraint;
+
+						solverConstraint.m_companionIdA = idA;
+						solverConstraint.m_companionIdB = idB;
+
+						{
+							const btVector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal;
+							solverConstraint.m_angularComponentA = constraint->getRigidBodyA().getInvInertiaTensorWorld()*ftorqueAxis1*constraint->getRigidBodyA().getAngularFactor();
+						}
+						{
+							const btVector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal;
+							solverConstraint.m_angularComponentB = constraint->getRigidBodyB().getInvInertiaTensorWorld()*ftorqueAxis2*constraint->getRigidBodyB().getAngularFactor();
+						}
+
+						{
+							btVector3 iMJlA = solverConstraint.m_contactNormal*rbA.getInvMass();
+							btVector3 iMJaA = rbA.getInvInertiaTensorWorld()*solverConstraint.m_relpos1CrossNormal;
+							btVector3 iMJlB = solverConstraint.m_contactNormal*rbB.getInvMass();//sign of normal?
+							btVector3 iMJaB = rbB.getInvInertiaTensorWorld()*solverConstraint.m_relpos2CrossNormal;
+
+							btScalar sum = iMJlA.dot(solverConstraint.m_contactNormal);
+							sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
+							sum += iMJlB.dot(solverConstraint.m_contactNormal);
+							sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
+
+							solverConstraint.m_jacDiagABInv = btScalar(1.)/sum;
+						}
+
+
+						///fix rhs
+						///todo: add force/torque accelerators
+						{
+							btScalar rel_vel;
+							btScalar vel1Dotn = solverConstraint.m_contactNormal.dot(rbA.getLinearVelocity()) + solverConstraint.m_relpos1CrossNormal.dot(rbA.getAngularVelocity());
+							btScalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rbB.getLinearVelocity()) + solverConstraint.m_relpos2CrossNormal.dot(rbB.getAngularVelocity());
+
+							rel_vel = vel1Dotn+vel2Dotn;
+
+							btScalar restitution = 0.f;
+							btScalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2
+							btScalar	velocityError = restitution - rel_vel;// * damping;
+							btScalar	penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv;
+							btScalar	velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv;
+							solverConstraint.m_rhs = penetrationImpulse+velocityImpulse;
+							solverConstraint.m_appliedImpulse = 0.f;
+
+						}
+					}
+
+					PfxConstraintPair& pair = jointPairs[actualNumJoints];
+					
+					int numConstraintRows= info1.m_numConstraintRows;
+					pfxSetNumConstraints(pair,numConstraintRows);
+					
+
+
+					pfxSetRigidBodyIdA(pair,idA);
+					pfxSetRigidBodyIdB(pair,idB);
+					//is this needed?
+					pfxSetMotionMaskA(pair,m_memoryCache->m_mystates[idA].getMotionMask());
+					pfxSetMotionMaskB(pair,m_memoryCache->m_mystates[idB].getMotionMask());
+
+					pfxSetActive(pair,true);
+					pfxSetContactId(pair,(uint64_t)currentConstraintRow);//contactId);
+					actualNumJoints++;
+
+
+				}
+				currentRow+=m_tmpConstraintSizesPool[i].m_numConstraintRows;
+			}
+		}
+	}
+
+
+	
+	float separateBias=0.1;//info.m_erp;//or m_erp2?
+	float timeStep=infoGlobal.m_timeStep;
+	int iteration=infoGlobal.m_numIterations;
+
+	//create a pair for each constraints, copy over info etc
+	
+	
+
+
+	
+	{
+		BT_PROFILE("compute num contacts");
+		int totalContacts =0;
+
+		for (int i=0;i<actualNumManifolds;i++)
+		{
+			PfxConstraintPair* pair = &m_memoryCache->m_mypairs[i];
+			totalContacts += pfxGetNumConstraints(*pair);
+		}
+		//printf("numManifolds = %d\n",numManifolds);
+		//printf("totalContacts=%d\n",totalContacts);
+	}
+	
+
+
+//	printf("actualNumManifolds=%d\n",actualNumManifolds);
+	{
+		BT_PROFILE("BPE_customConstraintSolverSequentialNew");
+		if (numRigidBodies>0 && (actualNumManifolds+actualNumJoints)>0)
+		{
+//			PFX_PRINTF("num points = %d\n",totalPoints);
+//			PFX_PRINTF("num points PFX = %d\n",total);
+			
+			BPE_customConstraintSolverSequentialNew(
+				actualNumManifolds,
+				&m_memoryCache->m_mypairs[0],
+				&m_memoryCache->m_mystates[0],numRigidBodies,
+				&m_memoryCache->m_mysolverbodies[0],
+				0,//manifoldArray,
+				jointPairs,actualNumJoints,separateBias,timeStep,iteration,
+				m_solverThreadSupport,m_criticalSection,m_solverIO,m_barrier);
+		}
+	}
+
+	//copy results back to bodies
+	{
+		BT_PROFILE("copy back");
+		for (int i=0;i<numRigidBodies;i++)
+		{
+			btCollisionObject* obj = bodies1[i];
+			btRigidBody* rb = btRigidBody::upcast(obj);
+			TrbState& state = m_memoryCache->m_mystates[i];
+			if (rb && (rb->getInvMass()>0.f))
+			{
+				rb->setLinearVelocity(btVector3(state.getLinearVelocity().getX(),state.getLinearVelocity().getY(),state.getLinearVelocity().getZ()));
+				rb->setAngularVelocity(btVector3(state.getAngularVelocity().getX(),state.getAngularVelocity().getY(),state.getAngularVelocity().getZ()));
+			}
+		}
+	}
+
+
 	return 0.f;
 }
-
diff --git a/src/BulletMultiThreaded/btParallelConstraintSolver.h b/src/BulletMultiThreaded/btParallelConstraintSolver.h
index c347f96f5..2e8fbaf13 100644
--- a/src/BulletMultiThreaded/btParallelConstraintSolver.h
+++ b/src/BulletMultiThreaded/btParallelConstraintSolver.h
@@ -18,22 +18,238 @@ subject to the following restrictions:
 #define __BT_PARALLEL_CONSTRAINT_SOLVER_H
 
 #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"
-									      
+
+
+
+
+#include "LinearMath/btScalar.h"
+#include "PlatformDefinitions.h"
+
+
+#define PFX_MAX_SOLVER_PHASES 64
+#define PFX_MAX_SOLVER_BATCHES 16
+#define PFX_MAX_SOLVER_PAIRS  128
+#define PFX_MIN_SOLVER_PAIRS  16
+
+#ifdef __CELLOS_LV2__
+ATTRIBUTE_ALIGNED128(struct) PfxParallelBatch {
+#else
+ATTRIBUTE_ALIGNED16(struct) PfxParallelBatch {
+#endif
+	uint16_t pairIndices[PFX_MAX_SOLVER_PAIRS];
+};
+
+#ifdef __CELLOS_LV2__
+ATTRIBUTE_ALIGNED128(struct) PfxParallelGroup {
+#else
+ATTRIBUTE_ALIGNED16(struct) PfxParallelGroup {
+#endif
+	uint16_t numPhases;
+	uint16_t numBatches[PFX_MAX_SOLVER_PHASES];
+	uint16_t numPairs[PFX_MAX_SOLVER_PHASES*PFX_MAX_SOLVER_BATCHES];
+};
+
+
+
+ATTRIBUTE_ALIGNED16(struct) PfxSortData16 {
+	union {
+		uint8_t   i8data[16];
+		uint16_t  i16data[8];
+		uint32_t  i32data[4];
+#ifdef __SPU__
+		vec_uint4 vdata;
+#endif
+	};
+
+#ifdef __SPU__
+	void set8(int elem,uint8_t data)   {vdata=(vec_uint4)spu_insert(data,(vec_uchar16)vdata,elem);}
+	void set16(int elem,uint16_t data) {vdata=(vec_uint4)spu_insert(data,(vec_ushort8)vdata,elem);}
+	void set32(int elem,uint32_t data) {vdata=(vec_uint4)spu_insert(data,(vec_uint4)vdata,elem);}
+	uint8_t get8(int elem)   const {return spu_extract((vec_uchar16)vdata,elem);}
+	uint16_t get16(int elem) const {return spu_extract((vec_ushort8)vdata,elem);}
+	uint32_t get32(int elem) const {return spu_extract((vec_uint4)vdata,elem);}
+#else
+	void set8(int elem,uint8_t data)   {i8data[elem] = data;}
+	void set16(int elem,uint16_t data) {i16data[elem] = data;}
+	void set32(int elem,uint32_t data) {i32data[elem] = data;}
+	uint8_t get8(int elem)   const {return i8data[elem];}
+	uint16_t get16(int elem) const {return i16data[elem];}
+	uint32_t get32(int elem) const {return i32data[elem];}
+#endif
+};
+
+typedef PfxSortData16 PfxConstraintPair;
+
+
+//J	PfxBroadphasePair�Ƌ���
+
+SIMD_FORCE_INLINE void pfxSetConstraintId(PfxConstraintPair &pair,uint32_t i)	{pair.set32(2,i);}
+SIMD_FORCE_INLINE void pfxSetNumConstraints(PfxConstraintPair &pair,uint8_t n)	{pair.set8(7,n);}
+
+SIMD_FORCE_INLINE uint32_t pfxGetConstraintId1(const PfxConstraintPair &pair)	{return pair.get32(2);}
+SIMD_FORCE_INLINE uint8_t  pfxGetNumConstraints(const PfxConstraintPair &pair)	{return pair.get8(7);}
+
+typedef PfxSortData16 PfxBroadphasePair;
+
+SIMD_FORCE_INLINE void pfxSetRigidBodyIdA(PfxBroadphasePair &pair,uint16_t i)	{pair.set16(0,i);}
+SIMD_FORCE_INLINE void pfxSetRigidBodyIdB(PfxBroadphasePair &pair,uint16_t i)	{pair.set16(1,i);}
+SIMD_FORCE_INLINE void pfxSetMotionMaskA(PfxBroadphasePair &pair,uint8_t i)		{pair.set8(4,i);}
+SIMD_FORCE_INLINE void pfxSetMotionMaskB(PfxBroadphasePair &pair,uint8_t i)		{pair.set8(5,i);}
+SIMD_FORCE_INLINE void pfxSetBroadphaseFlag(PfxBroadphasePair &pair,uint8_t f)	{pair.set8(6,(pair.get8(6)&0xf0)|(f&0x0f));}
+SIMD_FORCE_INLINE void pfxSetActive(PfxBroadphasePair &pair,bool b)			{pair.set8(6,(pair.get8(6)&0x0f)|((b?1:0)<<4));}
+SIMD_FORCE_INLINE void pfxSetContactId(PfxBroadphasePair &pair,uint32_t i)		{pair.set32(2,i);}
+
+SIMD_FORCE_INLINE uint16_t pfxGetRigidBodyIdA(const PfxBroadphasePair &pair)	{return pair.get16(0);}
+SIMD_FORCE_INLINE uint16_t pfxGetRigidBodyIdB(const PfxBroadphasePair &pair)	{return pair.get16(1);}
+SIMD_FORCE_INLINE uint8_t  pfxGetMotionMaskA(const PfxBroadphasePair &pair)		{return pair.get8(4);}
+SIMD_FORCE_INLINE uint8_t  pfxGetMotionMaskB(const PfxBroadphasePair &pair)		{return pair.get8(5);}
+SIMD_FORCE_INLINE uint8_t  pfxGetBroadphaseFlag(const PfxBroadphasePair &pair)	{return pair.get8(6)&0x0f;}
+SIMD_FORCE_INLINE bool     pfxGetActive(const PfxBroadphasePair &pair)			{return (pair.get8(6)>>4)!=0;}
+SIMD_FORCE_INLINE uint32_t pfxGetContactId1(const PfxBroadphasePair &pair)		{return pair.get32(2);}
+
+
+
+#if defined(__PPU__) || defined (__SPU__)
+ATTRIBUTE_ALIGNED128(struct) PfxSolverBody {
+#else
+ATTRIBUTE_ALIGNED16(struct) PfxSolverBody {
+#endif
+	vmVector3 mDeltaLinearVelocity;
+	vmVector3 mDeltaAngularVelocity;
+	vmMatrix3 mInertiaInv;
+	vmQuat    mOrientation;
+	float   mMassInv;
+	float   friction;
+	float   restitution;
+	float   unused;
+	float   unused2;
+	float   unused3;
+	float   unused4;
+	float   unused5;
+};
+
+
+#ifdef __PPU__
+#include "SpuDispatch/BulletPE2ConstraintSolverSpursSupport.h"
+#endif
+
+
+
+static SIMD_FORCE_INLINE vmVector3 btReadVector3(const float* p)
+{
+	vmVector3 v;
+	loadXYZ(v, p);
+	return v;
+}
+
+static SIMD_FORCE_INLINE vmQuat btReadQuat(const float* p)
+{
+	vmQuat vq;
+	loadXYZW(vq, p);
+	return vq;
+}
+
+static SIMD_FORCE_INLINE void btStoreVector3(const vmVector3 &src, float* p)
+{
+	vmVector3 v = src;
+	storeXYZ(v, p);
+}
+
+
+
+
+class btPersistentManifold;
+
+enum {
+	PFX_CONSTRAINT_SOLVER_CMD_SETUP_SOLVER_BODIES,
+	PFX_CONSTRAINT_SOLVER_CMD_SETUP_CONTACT_CONSTRAINTS,
+	PFX_CONSTRAINT_SOLVER_CMD_SETUP_JOINT_CONSTRAINTS,
+	PFX_CONSTRAINT_SOLVER_CMD_SOLVE_CONSTRAINTS,
+	PFX_CONSTRAINT_SOLVER_CMD_POST_SOLVER
+};
+
+
+struct PfxSetupContactConstraintsIO {
+	PfxConstraintPair *offsetContactPairs;
+	uint32_t numContactPairs1;
+	class TrbState *offsetRigStates;
+	struct PfxSolverBody *offsetSolverBodies;
+	uint32_t numRigidBodies;
+	float separateBias;
+	float timeStep;
+	class btCriticalSection* criticalSection;
+};
+
+
+
+struct PfxSolveConstraintsIO {
+	PfxParallelGroup *contactParallelGroup;
+	PfxParallelBatch *contactParallelBatches;
+	PfxConstraintPair *contactPairs;
+	uint32_t numContactPairs;
+	btPersistentManifold *offsetContactManifolds;
+	PfxParallelGroup *jointParallelGroup;
+	PfxParallelBatch *jointParallelBatches;
+	PfxConstraintPair *jointPairs;
+	uint32_t numJointPairs;
+	TrbState *offsetRigStates;
+	PfxSolverBody *offsetSolverBodies;
+	uint32_t numRigidBodies;
+	uint32_t iteration;
+
+	uint32_t	taskId;
+	
+	class btBarrier* barrier;
+
+};
+
+struct PfxPostSolverIO {
+	TrbState *states;
+	PfxSolverBody *solverBodies;
+	uint32_t numRigidBodies;
+};
+
+ATTRIBUTE_ALIGNED16(struct) btConstraintSolverIO {
+	uint8_t cmd;
+	union {
+		PfxSetupContactConstraintsIO setupContactConstraints;
+		PfxSolveConstraintsIO solveConstraints;
+		PfxPostSolverIO postSolver;
+	};
+	
+	//SPU only
+	uint32_t barrierAddr2;
+	uint32_t criticalsectionAddr2;
+	uint32_t maxTasks1;
+};
+
+
+
+
+void	SolverThreadFunc(void* userPtr,void* lsMemory);
+void*	SolverlsMemoryFunc();
+///The btParallelConstraintSolver performs computations on constraint rows in parallel
+///Using the cross-platform threading it supports Windows, Linux, Mac OSX and PlayStation 3 Cell SPUs
 class btParallelConstraintSolver : public btSequentialImpulseConstraintSolver
 {
+	
 protected:
+	struct btParallelSolverMemoryCache*	m_memoryCache;
+
+	class btThreadSupportInterface*	m_solverThreadSupport;
+
+	struct btConstraintSolverIO* m_solverIO;
+	class btBarrier*			m_barrier;
+	class btCriticalSection*	m_criticalSection;
+
 
 public:
 
-	btParallelConstraintSolver();
+	btParallelConstraintSolver(class btThreadSupportInterface* solverThreadSupport);
 	
 	virtual ~btParallelConstraintSolver();
 
-	//virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher);
-	
-	btScalar solveGroupCacheFriendlySetup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer,btStackAlloc* stackAlloc);
-
-
+	virtual btScalar solveGroup(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifold,int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btStackAlloc* stackAlloc,btDispatcher* dispatcher);
 
 };
 
diff --git a/src/BulletMultiThreaded/vectormath/scalar/cpp/boolInVec.h b/src/BulletMultiThreaded/vectormath/scalar/boolInVec.h
similarity index 100%
rename from src/BulletMultiThreaded/vectormath/scalar/cpp/boolInVec.h
rename to src/BulletMultiThreaded/vectormath/scalar/boolInVec.h
diff --git a/src/BulletMultiThreaded/vectormath/scalar/cpp/floatInVec.h b/src/BulletMultiThreaded/vectormath/scalar/floatInVec.h
similarity index 100%
rename from src/BulletMultiThreaded/vectormath/scalar/cpp/floatInVec.h
rename to src/BulletMultiThreaded/vectormath/scalar/floatInVec.h
diff --git a/src/BulletMultiThreaded/vectormath/scalar/cpp/mat_aos.h b/src/BulletMultiThreaded/vectormath/scalar/mat_aos.h
similarity index 100%
rename from src/BulletMultiThreaded/vectormath/scalar/cpp/mat_aos.h
rename to src/BulletMultiThreaded/vectormath/scalar/mat_aos.h
diff --git a/src/BulletMultiThreaded/vectormath/scalar/cpp/quat_aos.h b/src/BulletMultiThreaded/vectormath/scalar/quat_aos.h
similarity index 100%
rename from src/BulletMultiThreaded/vectormath/scalar/cpp/quat_aos.h
rename to src/BulletMultiThreaded/vectormath/scalar/quat_aos.h
diff --git a/src/BulletMultiThreaded/vectormath/scalar/cpp/vec_aos.h b/src/BulletMultiThreaded/vectormath/scalar/vec_aos.h
similarity index 100%
rename from src/BulletMultiThreaded/vectormath/scalar/cpp/vec_aos.h
rename to src/BulletMultiThreaded/vectormath/scalar/vec_aos.h
diff --git a/src/BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h b/src/BulletMultiThreaded/vectormath/scalar/vectormath_aos.h
similarity index 100%
rename from src/BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h
rename to src/BulletMultiThreaded/vectormath/scalar/vectormath_aos.h
diff --git a/src/BulletMultiThreaded/vectormath/sse/boolInVec.h b/src/BulletMultiThreaded/vectormath/sse/boolInVec.h
new file mode 100644
index 000000000..d18cb15ce
--- /dev/null
+++ b/src/BulletMultiThreaded/vectormath/sse/boolInVec.h
@@ -0,0 +1,247 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _BOOLINVEC_H
+#define _BOOLINVEC_H
+
+#include <math.h>
+
+namespace Vectormath {
+
+class floatInVec;
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec class
+//
+
+class boolInVec
+{
+    private:
+        __m128 mData;
+
+        inline boolInVec(__m128 vec);
+    public:
+        inline boolInVec() {}
+
+        // matches standard type conversions
+        //
+        inline boolInVec(const floatInVec &vec);
+
+        // explicit cast from bool
+        //
+        explicit inline boolInVec(bool scalar);
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+        // explicit cast to bool
+        // 
+        inline bool getAsBool() const;
+#else
+        // implicit cast to bool
+        // 
+        inline operator bool() const;
+#endif
+        
+        // get vector data
+        // bool value is splatted across all word slots of vector as 0 (false) or -1 (true)
+        //
+        inline __m128 get128() const;
+
+        // operators
+        //
+        inline const boolInVec operator ! () const;
+        inline boolInVec& operator = (const boolInVec &vec);
+        inline boolInVec& operator &= (const boolInVec &vec);
+        inline boolInVec& operator ^= (const boolInVec &vec);
+        inline boolInVec& operator |= (const boolInVec &vec);
+
+        // friend functions
+        //
+        friend inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1);
+        friend inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1);
+};
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec functions
+//
+
+// operators
+//
+inline const boolInVec operator == (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator != (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator & (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator ^ (const boolInVec &vec0, const boolInVec &vec1);
+inline const boolInVec operator | (const boolInVec &vec0, const boolInVec &vec1);
+
+// select between vec0 and vec1 using boolInVec.
+// false selects vec0, true selects vec1
+//
+inline const boolInVec select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1);
+
+} // namespace Vectormath
+
+//--------------------------------------------------------------------------------------------------
+// boolInVec implementation
+//
+
+#include "floatInVec.h"
+
+namespace Vectormath {
+
+inline
+boolInVec::boolInVec(__m128 vec)
+{
+    mData = vec;
+}
+
+inline
+boolInVec::boolInVec(const floatInVec &vec)
+{
+    *this = (vec != floatInVec(0.0f));
+}
+
+inline
+boolInVec::boolInVec(bool scalar)
+{
+    unsigned int mask = -(int)scalar;
+	mData = _mm_set1_ps(*(float *)&mask); // TODO: Union
+}
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+inline
+bool
+boolInVec::getAsBool() const
+#else
+inline
+boolInVec::operator bool() const
+#endif
+{
+	return *(bool *)&mData;
+}
+
+inline
+__m128
+boolInVec::get128() const
+{
+    return mData;
+}
+
+inline
+const boolInVec
+boolInVec::operator ! () const
+{
+    return boolInVec(_mm_andnot_ps(mData, _mm_cmpneq_ps(_mm_setzero_ps(),_mm_setzero_ps())));
+}
+
+inline
+boolInVec&
+boolInVec::operator = (const boolInVec &vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator &= (const boolInVec &vec)
+{
+    *this = *this & vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator ^= (const boolInVec &vec)
+{
+    *this = *this ^ vec;
+    return *this;
+}
+
+inline
+boolInVec&
+boolInVec::operator |= (const boolInVec &vec)
+{
+    *this = *this | vec;
+    return *this;
+}
+
+inline
+const boolInVec
+operator == (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator != (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128()));
+}
+    
+inline
+const boolInVec
+operator & (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_and_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator | (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_or_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator ^ (const boolInVec &vec0, const boolInVec &vec1)
+{
+	return boolInVec(_mm_xor_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+select(const boolInVec &vec0, const boolInVec &vec1, const boolInVec &select_vec1)
+{
+	return boolInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128()));
+}
+ 
+} // namespace Vectormath
+
+#endif // boolInVec_h
diff --git a/src/BulletMultiThreaded/vectormath/sse/floatInVec.h b/src/BulletMultiThreaded/vectormath/sse/floatInVec.h
new file mode 100644
index 000000000..6443865b1
--- /dev/null
+++ b/src/BulletMultiThreaded/vectormath/sse/floatInVec.h
@@ -0,0 +1,340 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _FLOATINVEC_H
+#define _FLOATINVEC_H
+
+#include <math.h>
+#include <xmmintrin.h>
+
+namespace Vectormath {
+
+class boolInVec;
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec class
+//
+
+class floatInVec
+{
+    private:
+        __m128 mData;
+
+    public:
+        inline floatInVec(__m128 vec);
+
+        inline floatInVec() {}
+
+        // matches standard type conversions
+        //
+        inline floatInVec(const boolInVec &vec);
+
+        // construct from a slot of __m128
+        //
+        inline floatInVec(__m128 vec, int slot);
+        
+        // explicit cast from float
+        //
+        explicit inline floatInVec(float scalar);
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+        // explicit cast to float
+        // 
+        inline float getAsFloat() const;
+#else
+        // implicit cast to float
+        //
+        inline operator float() const;
+#endif
+
+        // get vector data
+        // float value is splatted across all word slots of vector
+        //
+        inline __m128 get128() const;
+
+        // operators
+        // 
+        inline const floatInVec operator ++ (int);
+        inline const floatInVec operator -- (int);
+        inline floatInVec& operator ++ ();
+        inline floatInVec& operator -- ();
+        inline const floatInVec operator - () const;
+        inline floatInVec& operator = (const floatInVec &vec);
+        inline floatInVec& operator *= (const floatInVec &vec);
+        inline floatInVec& operator /= (const floatInVec &vec);
+        inline floatInVec& operator += (const floatInVec &vec);
+        inline floatInVec& operator -= (const floatInVec &vec);
+
+        // friend functions
+        //
+        friend inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1);
+        friend inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, boolInVec select_vec1);
+};
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec functions
+//
+
+// operators
+// 
+inline const floatInVec operator * (const floatInVec &vec0, const floatInVec &vec1);
+inline const floatInVec operator / (const floatInVec &vec0, const floatInVec &vec1);
+inline const floatInVec operator + (const floatInVec &vec0, const floatInVec &vec1);
+inline const floatInVec operator - (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator < (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator <= (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator > (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator >= (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator == (const floatInVec &vec0, const floatInVec &vec1);
+inline const boolInVec operator != (const floatInVec &vec0, const floatInVec &vec1);
+
+// select between vec0 and vec1 using boolInVec.
+// false selects vec0, true selects vec1
+//
+inline const floatInVec select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1);
+
+} // namespace Vectormath
+
+//--------------------------------------------------------------------------------------------------
+// floatInVec implementation
+//
+
+#include "boolInVec.h"
+
+namespace Vectormath {
+
+inline
+floatInVec::floatInVec(__m128 vec)
+{
+    mData = vec;
+}
+
+inline
+floatInVec::floatInVec(const boolInVec &vec)
+{
+	mData = vec_sel(_mm_setzero_ps(), _mm_set1_ps(1.0f), vec.get128());
+}
+
+inline
+floatInVec::floatInVec(__m128 vec, int slot)
+{
+	SSEFloat v;
+	v.m128 = vec;
+	mData = _mm_set1_ps(v.f[slot]);
+}
+
+inline
+floatInVec::floatInVec(float scalar)
+{
+	mData = _mm_set1_ps(scalar);
+}
+
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+inline
+float
+floatInVec::getAsFloat() const
+#else
+inline
+floatInVec::operator float() const
+#endif
+{
+    return *((float *)&mData);
+}
+
+inline
+__m128
+floatInVec::get128() const
+{
+    return mData;
+}
+
+inline
+const floatInVec
+floatInVec::operator ++ (int)
+{
+    __m128 olddata = mData;
+    operator ++();
+    return floatInVec(olddata);
+}
+
+inline
+const floatInVec
+floatInVec::operator -- (int)
+{
+    __m128 olddata = mData;
+    operator --();
+    return floatInVec(olddata);
+}
+
+inline
+floatInVec&
+floatInVec::operator ++ ()
+{
+    *this += floatInVec(_mm_set1_ps(1.0f));
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -- ()
+{
+    *this -= floatInVec(_mm_set1_ps(1.0f));
+    return *this;
+}
+
+inline
+const floatInVec
+floatInVec::operator - () const
+{
+    return floatInVec(_mm_sub_ps(_mm_setzero_ps(), mData));
+}
+
+inline
+floatInVec&
+floatInVec::operator = (const floatInVec &vec)
+{
+    mData = vec.mData;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator *= (const floatInVec &vec)
+{
+    *this = *this * vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator /= (const floatInVec &vec)
+{
+    *this = *this / vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator += (const floatInVec &vec)
+{
+    *this = *this + vec;
+    return *this;
+}
+
+inline
+floatInVec&
+floatInVec::operator -= (const floatInVec &vec)
+{
+    *this = *this - vec;
+    return *this;
+}
+
+inline
+const floatInVec
+operator * (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return floatInVec(_mm_mul_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const floatInVec
+operator / (const floatInVec &num, const floatInVec &den)
+{
+    return floatInVec(_mm_div_ps(num.get128(), den.get128()));
+}
+
+inline
+const floatInVec
+operator + (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return floatInVec(_mm_add_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const floatInVec
+operator - (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return floatInVec(_mm_sub_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator < (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpgt_ps(vec1.get128(), vec0.get128()));
+}
+
+inline
+const boolInVec
+operator <= (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpge_ps(vec1.get128(), vec0.get128()));
+}
+
+inline
+const boolInVec
+operator > (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpgt_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator >= (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpge_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator == (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpeq_ps(vec0.get128(), vec1.get128()));
+}
+
+inline
+const boolInVec
+operator != (const floatInVec &vec0, const floatInVec &vec1)
+{
+    return boolInVec(_mm_cmpneq_ps(vec0.get128(), vec1.get128()));
+}
+    
+inline
+const floatInVec
+select(const floatInVec &vec0, const floatInVec &vec1, const boolInVec &select_vec1)
+{
+    return floatInVec(vec_sel(vec0.get128(), vec1.get128(), select_vec1.get128()));
+}
+
+} // namespace Vectormath
+
+#endif // floatInVec_h
diff --git a/src/BulletMultiThreaded/vectormath/sse/mat_aos.h b/src/BulletMultiThreaded/vectormath/sse/mat_aos.h
new file mode 100644
index 000000000..2ba27bf2a
--- /dev/null
+++ b/src/BulletMultiThreaded/vectormath/sse/mat_aos.h
@@ -0,0 +1,2190 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef _VECTORMATH_MAT_AOS_CPP_H
+#define _VECTORMATH_MAT_AOS_CPP_H
+
+namespace Vectormath {
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// Constants
+// for shuffles, words are labeled [x,y,z,w] [a,b,c,d]
+
+#define _VECTORMATH_PERM_ZBWX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_B, _VECTORMATH_PERM_W, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_XCYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_C, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_XYAB ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B })
+#define _VECTORMATH_PERM_ZWCD ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_W, _VECTORMATH_PERM_C, _VECTORMATH_PERM_D })
+#define _VECTORMATH_PERM_XZBX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_B, _VECTORMATH_PERM_X })     
+#define _VECTORMATH_PERM_CXXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_YAXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_XAZC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_C })
+#define _VECTORMATH_PERM_YXWZ ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X, _VECTORMATH_PERM_W, _VECTORMATH_PERM_Z })
+#define _VECTORMATH_PERM_YBWD ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_B, _VECTORMATH_PERM_W, _VECTORMATH_PERM_D })
+#define _VECTORMATH_PERM_XYCX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_YCXY ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y })
+#define _VECTORMATH_PERM_CXYC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_C, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_C })
+#define _VECTORMATH_PERM_ZAYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_BZXX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_B, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PERM_XZYA ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A })
+#define _VECTORMATH_PERM_ZXXB ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_B })
+#define _VECTORMATH_PERM_YXXC ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X, _VECTORMATH_PERM_X, _VECTORMATH_PERM_C })
+#define _VECTORMATH_PERM_BBYX ((vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_B, _VECTORMATH_PERM_B, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_X })
+#define _VECTORMATH_PI_OVER_2 1.570796327f
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+__forceinline Matrix3::Matrix3( const Matrix3 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+}
+
+__forceinline Matrix3::Matrix3( float scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+}
+
+__forceinline Matrix3::Matrix3( const floatInVec &scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+}
+
+__forceinline Matrix3::Matrix3( const Quat &unitQuat )
+{
+    __m128 xyzw_2, wwww, yzxw, zxyw, yzxw_2, zxyw_2;
+    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+	__declspec(align(16)) unsigned int sx[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int sz[4] = {0, 0, 0xffffffff, 0};
+	__m128 select_x = _mm_load_ps((float *)sx);
+	__m128 select_z = _mm_load_ps((float *)sz);
+
+    xyzw_2 = _mm_add_ps( unitQuat.get128(), unitQuat.get128() );
+    wwww = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,3,3,3) );
+	yzxw = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,0,2,1) );
+	zxyw = _mm_shuffle_ps( unitQuat.get128(), unitQuat.get128(), _MM_SHUFFLE(3,1,0,2) );
+    yzxw_2 = _mm_shuffle_ps( xyzw_2, xyzw_2, _MM_SHUFFLE(3,0,2,1) );
+    zxyw_2 = _mm_shuffle_ps( xyzw_2, xyzw_2, _MM_SHUFFLE(3,1,0,2) );
+
+    tmp0 = _mm_mul_ps( yzxw_2, wwww );									// tmp0 = 2yw, 2zw, 2xw, 2w2
+	tmp1 = _mm_sub_ps( _mm_set1_ps(1.0f), _mm_mul_ps(yzxw, yzxw_2) );	// tmp1 = 1 - 2y2, 1 - 2z2, 1 - 2x2, 1 - 2w2
+    tmp2 = _mm_mul_ps( yzxw, xyzw_2 );									// tmp2 = 2xy, 2yz, 2xz, 2w2
+    tmp0 = _mm_add_ps( _mm_mul_ps(zxyw, xyzw_2), tmp0 );				// tmp0 = 2yw + 2zx, 2zw + 2xy, 2xw + 2yz, 2w2 + 2w2
+    tmp1 = _mm_sub_ps( tmp1, _mm_mul_ps(zxyw, zxyw_2) );				// tmp1 = 1 - 2y2 - 2z2, 1 - 2z2 - 2x2, 1 - 2x2 - 2y2, 1 - 2w2 - 2w2
+    tmp2 = _mm_sub_ps( tmp2, _mm_mul_ps(zxyw_2, wwww) );				// tmp2 = 2xy - 2zw, 2yz - 2xw, 2xz - 2yw, 2w2 -2w2
+
+    tmp3 = vec_sel( tmp0, tmp1, select_x );
+    tmp4 = vec_sel( tmp1, tmp2, select_x );
+    tmp5 = vec_sel( tmp2, tmp0, select_x );
+    mCol0 = Vector3( vec_sel( tmp3, tmp2, select_z ) );
+    mCol1 = Vector3( vec_sel( tmp4, tmp0, select_z ) );
+    mCol2 = Vector3( vec_sel( tmp5, tmp1, select_z ) );
+}
+
+__forceinline Matrix3::Matrix3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+}
+
+__forceinline Matrix3 & Matrix3::setCol0( const Vector3 &_col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+__forceinline Matrix3 & Matrix3::setCol1( const Vector3 &_col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+__forceinline Matrix3 & Matrix3::setCol2( const Vector3 &_col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+__forceinline Matrix3 & Matrix3::setCol( int col, const Vector3 &vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+__forceinline Matrix3 & Matrix3::setRow( int row, const Vector3 &vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    return *this;
+}
+
+__forceinline Matrix3 & Matrix3::setElem( int col, int row, float val )
+{
+    (*this)[col].setElem(row, val);
+    return *this;
+}
+
+__forceinline Matrix3 & Matrix3::setElem( int col, int row, const floatInVec &val )
+{
+    Vector3 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+__forceinline const floatInVec Matrix3::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+__forceinline const Vector3 Matrix3::getCol0( ) const
+{
+    return mCol0;
+}
+
+__forceinline const Vector3 Matrix3::getCol1( ) const
+{
+    return mCol1;
+}
+
+__forceinline const Vector3 Matrix3::getCol2( ) const
+{
+    return mCol2;
+}
+
+__forceinline const Vector3 Matrix3::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+__forceinline const Vector3 Matrix3::getRow( int row ) const
+{
+    return Vector3( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ) );
+}
+
+__forceinline Vector3 & Matrix3::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+__forceinline const Vector3 Matrix3::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+__forceinline Matrix3 & Matrix3::operator =( const Matrix3 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    return *this;
+}
+
+__forceinline const Matrix3 transpose( const Matrix3 & mat )
+{
+    __m128 tmp0, tmp1, res0, res1, res2;
+    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
+    tmp1 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
+    res0 = vec_mergeh( tmp0, mat.getCol1().get128() );
+    //res1 = vec_perm( tmp0, mat.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	res1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
+	res1 = vec_sel(res1, mat.getCol1().get128(), select_y);
+    //res2 = vec_perm( tmp1, mat.getCol1().get128(), _VECTORMATH_PERM_XCYX );
+	res2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
+	res2 = vec_sel(res2, vec_splat(mat.getCol1().get128(), 2), select_y);
+    return Matrix3(
+        Vector3( res0 ),
+        Vector3( res1 ),
+        Vector3( res2 )
+    );
+}
+
+__forceinline const Matrix3 inverse( const Matrix3 & mat )
+{
+    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet, inv0, inv1, inv2;
+    tmp2 = _vmathVfCross( mat.getCol0().get128(), mat.getCol1().get128() );
+    tmp0 = _vmathVfCross( mat.getCol1().get128(), mat.getCol2().get128() );
+    tmp1 = _vmathVfCross( mat.getCol2().get128(), mat.getCol0().get128() );
+    dot = _vmathVfDot3( tmp2, mat.getCol2().get128() );
+    dot = vec_splat( dot, 0 );
+    invdet = recipf4( dot );
+    tmp3 = vec_mergeh( tmp0, tmp2 );
+    tmp4 = vec_mergel( tmp0, tmp2 );
+    inv0 = vec_mergeh( tmp3, tmp1 );
+    //inv1 = vec_perm( tmp3, tmp1, _VECTORMATH_PERM_ZBWX );
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	inv1 = _mm_shuffle_ps( tmp3, tmp3, _MM_SHUFFLE(0,3,2,2));
+	inv1 = vec_sel(inv1, tmp1, select_y);
+    //inv2 = vec_perm( tmp4, tmp1, _VECTORMATH_PERM_XCYX );
+	inv2 = _mm_shuffle_ps( tmp4, tmp4, _MM_SHUFFLE(0,1,1,0));
+	inv2 = vec_sel(inv2, vec_splat(tmp1, 2), select_y);
+    inv0 = vec_mul( inv0, invdet );
+    inv1 = vec_mul( inv1, invdet );
+	inv2 = vec_mul( inv2, invdet );
+    return Matrix3(
+        Vector3( inv0 ),
+        Vector3( inv1 ),
+        Vector3( inv2 )
+    );
+}
+
+__forceinline const floatInVec determinant( const Matrix3 & mat )
+{
+    return dot( mat.getCol2(), cross( mat.getCol0(), mat.getCol1() ) );
+}
+
+__forceinline const Matrix3 Matrix3::operator +( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( mCol0 + mat.mCol0 ),
+        ( mCol1 + mat.mCol1 ),
+        ( mCol2 + mat.mCol2 )
+    );
+}
+
+__forceinline const Matrix3 Matrix3::operator -( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( mCol0 - mat.mCol0 ),
+        ( mCol1 - mat.mCol1 ),
+        ( mCol2 - mat.mCol2 )
+    );
+}
+
+__forceinline Matrix3 & Matrix3::operator +=( const Matrix3 & mat )
+{
+    *this = *this + mat;
+    return *this;
+}
+
+__forceinline Matrix3 & Matrix3::operator -=( const Matrix3 & mat )
+{
+    *this = *this - mat;
+    return *this;
+}
+
+__forceinline const Matrix3 Matrix3::operator -( ) const
+{
+    return Matrix3(
+        ( -mCol0 ),
+        ( -mCol1 ),
+        ( -mCol2 )
+    );
+}
+
+__forceinline const Matrix3 absPerElem( const Matrix3 & mat )
+{
+    return Matrix3(
+        absPerElem( mat.getCol0() ),
+        absPerElem( mat.getCol1() ),
+        absPerElem( mat.getCol2() )
+    );
+}
+
+__forceinline const Matrix3 Matrix3::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+__forceinline const Matrix3 Matrix3::operator *( const floatInVec &scalar ) const
+{
+    return Matrix3(
+        ( mCol0 * scalar ),
+        ( mCol1 * scalar ),
+        ( mCol2 * scalar )
+    );
+}
+
+__forceinline Matrix3 & Matrix3::operator *=( float scalar )
+{
+    return *this *= floatInVec(scalar);
+}
+
+__forceinline Matrix3 & Matrix3::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+__forceinline const Matrix3 operator *( float scalar, const Matrix3 & mat )
+{
+    return floatInVec(scalar) * mat;
+}
+
+__forceinline const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat )
+{
+    return mat * scalar;
+}
+
+__forceinline const Vector3 Matrix3::operator *( const Vector3 &vec ) const
+{
+    __m128 res;
+    __m128 xxxx, yyyy, zzzz;
+    xxxx = vec_splat( vec.get128(), 0 );
+    yyyy = vec_splat( vec.get128(), 1 );
+    zzzz = vec_splat( vec.get128(), 2 );
+    res = vec_mul( mCol0.get128(), xxxx );
+    res = vec_madd( mCol1.get128(), yyyy, res );
+    res = vec_madd( mCol2.get128(), zzzz, res );
+    return Vector3( res );
+}
+
+__forceinline const Matrix3 Matrix3::operator *( const Matrix3 & mat ) const
+{
+    return Matrix3(
+        ( *this * mat.mCol0 ),
+        ( *this * mat.mCol1 ),
+        ( *this * mat.mCol2 )
+    );
+}
+
+__forceinline Matrix3 & Matrix3::operator *=( const Matrix3 & mat )
+{
+    *this = *this * mat;
+    return *this;
+}
+
+__forceinline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 )
+{
+    return Matrix3(
+        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
+        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
+        mulPerElem( mat0.getCol2(), mat1.getCol2() )
+    );
+}
+
+__forceinline const Matrix3 Matrix3::identity( )
+{
+    return Matrix3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( )
+    );
+}
+
+__forceinline const Matrix3 Matrix3::rotationX( float radians )
+{
+    return rotationX( floatInVec(radians) );
+}
+
+__forceinline const Matrix3 Matrix3::rotationX( const floatInVec &radians )
+{
+    __m128 s, c, res1, res2;
+    __m128 zero;
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res1 = vec_sel( zero, c, select_y );
+    res1 = vec_sel( res1, s, select_z );
+    res2 = vec_sel( zero, negatef4(s), select_y );
+    res2 = vec_sel( res2, c, select_z );
+    return Matrix3(
+        Vector3::xAxis( ),
+        Vector3( res1 ),
+        Vector3( res2 )
+    );
+}
+
+__forceinline const Matrix3 Matrix3::rotationY( float radians )
+{
+    return rotationY( floatInVec(radians) );
+}
+
+__forceinline const Matrix3 Matrix3::rotationY( const floatInVec &radians )
+{
+    __m128 s, c, res0, res2;
+    __m128 zero;
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, negatef4(s), select_z );
+    res2 = vec_sel( zero, s, select_x );
+    res2 = vec_sel( res2, c, select_z );
+    return Matrix3(
+        Vector3( res0 ),
+        Vector3::yAxis( ),
+        Vector3( res2 )
+	);
+}
+
+__forceinline const Matrix3 Matrix3::rotationZ( float radians )
+{
+    return rotationZ( floatInVec(radians) );
+}
+
+__forceinline const Matrix3 Matrix3::rotationZ( const floatInVec &radians )
+{
+    __m128 s, c, res0, res1;
+    __m128 zero;
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, s, select_y );
+    res1 = vec_sel( zero, negatef4(s), select_x );
+    res1 = vec_sel( res1, c, select_y );
+    return Matrix3(
+        Vector3( res0 ),
+        Vector3( res1 ),
+        Vector3::zAxis( )
+	);
+}
+
+__forceinline const Matrix3 Matrix3::rotationZYX( const Vector3 &radiansXYZ )
+{
+    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
+    angles = Vector4( radiansXYZ, 0.0f ).get128();
+    sincosf4( angles, &s, &c );
+    negS = negatef4( s );
+    Z0 = vec_mergel( c, s );
+    Z1 = vec_mergel( negS, c );
+	__declspec(align(16)) unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
+    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
+	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
+	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
+    X0 = vec_splat( s, 0 );
+    X1 = vec_splat( c, 0 );
+    tmp = vec_mul( Z0, Y1 );
+    return Matrix3(
+        Vector3( vec_mul( Z0, Y0 ) ),
+        Vector3( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
+        Vector3( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) )
+    );
+}
+
+__forceinline const Matrix3 Matrix3::rotation( float radians, const Vector3 &unitVec )
+{
+    return rotation( floatInVec(radians), unitVec );
+}
+
+__forceinline const Matrix3 Matrix3::rotation( const floatInVec &radians, const Vector3 &unitVec )
+{
+    __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2;
+    axis = unitVec.get128();
+    sincosf4( radians.get128(), &s, &c );
+    xxxx = vec_splat( axis, 0 );
+    yyyy = vec_splat( axis, 1 );
+    zzzz = vec_splat( axis, 2 );
+    oneMinusC = vec_sub( _mm_set1_ps(1.0f), c );
+    axisS = vec_mul( axis, s );
+    negAxisS = negatef4( axisS );
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    //tmp0 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_XZBX );
+	tmp0 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,2,0) );
+	tmp0 = vec_sel(tmp0, vec_splat(negAxisS, 1), select_z);
+    //tmp1 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_CXXX );
+	tmp1 = vec_sel( vec_splat(axisS, 0), vec_splat(negAxisS, 2), select_x );
+    //tmp2 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_YAXX );
+	tmp2 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,0,1) );
+	tmp2 = vec_sel(tmp2, vec_splat(negAxisS, 0), select_y);
+    tmp0 = vec_sel( tmp0, c, select_x );
+    tmp1 = vec_sel( tmp1, c, select_y );
+    tmp2 = vec_sel( tmp2, c, select_z );
+    return Matrix3(
+        Vector3( vec_madd( vec_mul( axis, xxxx ), oneMinusC, tmp0 ) ),
+        Vector3( vec_madd( vec_mul( axis, yyyy ), oneMinusC, tmp1 ) ),
+        Vector3( vec_madd( vec_mul( axis, zzzz ), oneMinusC, tmp2 ) )
+    );
+}
+
+__forceinline const Matrix3 Matrix3::rotation( const Quat &unitQuat )
+{
+    return Matrix3( unitQuat );
+}
+
+__forceinline const Matrix3 Matrix3::scale( const Vector3 &scaleVec )
+{
+    __m128 zero = _mm_setzero_ps();
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    return Matrix3(
+        Vector3( vec_sel( zero, scaleVec.get128(), select_x ) ),
+        Vector3( vec_sel( zero, scaleVec.get128(), select_y ) ),
+        Vector3( vec_sel( zero, scaleVec.get128(), select_z ) )
+    );
+}
+
+__forceinline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec )
+{
+    return Matrix3(
+        ( mat.getCol0() * scaleVec.getX( ) ),
+        ( mat.getCol1() * scaleVec.getY( ) ),
+        ( mat.getCol2() * scaleVec.getZ( ) )
+    );
+}
+
+__forceinline const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat )
+{
+    return Matrix3(
+        mulPerElem( mat.getCol0(), scaleVec ),
+        mulPerElem( mat.getCol1(), scaleVec ),
+        mulPerElem( mat.getCol2(), scaleVec )
+    );
+}
+
+__forceinline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 )
+{
+    return Matrix3(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 )
+    );
+}
+
+__forceinline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 )
+{
+    return Matrix3(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+__forceinline void print( const Matrix3 & mat )
+{
+    print( mat.getRow( 0 ) );
+    print( mat.getRow( 1 ) );
+    print( mat.getRow( 2 ) );
+}
+
+__forceinline void print( const Matrix3 & mat, const char * name )
+{
+    printf("%s:\n", name);
+    print( mat );
+}
+
+#endif
+
+__forceinline Matrix4::Matrix4( const Matrix4 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    mCol3 = mat.mCol3;
+}
+
+__forceinline Matrix4::Matrix4( float scalar )
+{
+    mCol0 = Vector4( scalar );
+    mCol1 = Vector4( scalar );
+    mCol2 = Vector4( scalar );
+    mCol3 = Vector4( scalar );
+}
+
+__forceinline Matrix4::Matrix4( const floatInVec &scalar )
+{
+    mCol0 = Vector4( scalar );
+    mCol1 = Vector4( scalar );
+    mCol2 = Vector4( scalar );
+    mCol3 = Vector4( scalar );
+}
+
+__forceinline Matrix4::Matrix4( const Transform3 & mat )
+{
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( mat.getCol3(), 1.0f );
+}
+
+__forceinline Matrix4::Matrix4( const Vector4 &_col0, const Vector4 &_col1, const Vector4 &_col2, const Vector4 &_col3 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+    mCol3 = _col3;
+}
+
+__forceinline Matrix4::Matrix4( const Matrix3 & mat, const Vector3 &translateVec )
+{
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( translateVec, 1.0f );
+}
+
+__forceinline Matrix4::Matrix4( const Quat &unitQuat, const Vector3 &translateVec )
+{
+    Matrix3 mat;
+    mat = Matrix3( unitQuat );
+    mCol0 = Vector4( mat.getCol0(), 0.0f );
+    mCol1 = Vector4( mat.getCol1(), 0.0f );
+    mCol2 = Vector4( mat.getCol2(), 0.0f );
+    mCol3 = Vector4( translateVec, 1.0f );
+}
+
+__forceinline Matrix4 & Matrix4::setCol0( const Vector4 &_col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+__forceinline Matrix4 & Matrix4::setCol1( const Vector4 &_col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+__forceinline Matrix4 & Matrix4::setCol2( const Vector4 &_col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+__forceinline Matrix4 & Matrix4::setCol3( const Vector4 &_col3 )
+{
+    mCol3 = _col3;
+    return *this;
+}
+
+__forceinline Matrix4 & Matrix4::setCol( int col, const Vector4 &vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+__forceinline Matrix4 & Matrix4::setRow( int row, const Vector4 &vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    mCol3.setElem( row, vec.getElem( 3 ) );
+    return *this;
+}
+
+__forceinline Matrix4 & Matrix4::setElem( int col, int row, float val )
+{
+    (*this)[col].setElem(row, val);
+    return *this;
+}
+
+__forceinline Matrix4 & Matrix4::setElem( int col, int row, const floatInVec &val )
+{
+    Vector4 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+__forceinline const floatInVec Matrix4::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+__forceinline const Vector4 Matrix4::getCol0( ) const
+{
+    return mCol0;
+}
+
+__forceinline const Vector4 Matrix4::getCol1( ) const
+{
+    return mCol1;
+}
+
+__forceinline const Vector4 Matrix4::getCol2( ) const
+{
+    return mCol2;
+}
+
+__forceinline const Vector4 Matrix4::getCol3( ) const
+{
+    return mCol3;
+}
+
+__forceinline const Vector4 Matrix4::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+__forceinline const Vector4 Matrix4::getRow( int row ) const
+{
+    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
+}
+
+__forceinline Vector4 & Matrix4::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+__forceinline const Vector4 Matrix4::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+__forceinline Matrix4 & Matrix4::operator =( const Matrix4 & mat )
+{
+    mCol0 = mat.mCol0;
+    mCol1 = mat.mCol1;
+    mCol2 = mat.mCol2;
+    mCol3 = mat.mCol3;
+    return *this;
+}
+
+__forceinline const Matrix4 transpose( const Matrix4 & mat )
+{
+    __m128 tmp0, tmp1, tmp2, tmp3, res0, res1, res2, res3;
+    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
+    tmp1 = vec_mergeh( mat.getCol1().get128(), mat.getCol3().get128() );
+    tmp2 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
+    tmp3 = vec_mergel( mat.getCol1().get128(), mat.getCol3().get128() );
+    res0 = vec_mergeh( tmp0, tmp1 );
+    res1 = vec_mergel( tmp0, tmp1 );
+    res2 = vec_mergeh( tmp2, tmp3 );
+    res3 = vec_mergel( tmp2, tmp3 );
+    return Matrix4(
+        Vector4( res0 ),
+        Vector4( res1 ),
+        Vector4( res2 ),
+        Vector4( res3 )
+    );
+}
+
+// TODO: Tidy
+static __declspec(align(16)) const unsigned int _vmathPNPN[4] = {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+static __declspec(align(16)) const unsigned int _vmathNPNP[4] = {0x80000000, 0x00000000, 0x80000000, 0x00000000};
+static __declspec(align(16)) const float _vmathZERONE[4] = {1.0f, 0.0f, 0.0f, 1.0f};
+
+__forceinline const Matrix4 inverse( const Matrix4 & mat )
+{
+	__m128 Va,Vb,Vc;
+	__m128 r1,r2,r3,tt,tt2;
+	__m128 sum,Det,RDet;
+	__m128 trns0,trns1,trns2,trns3;
+
+	__m128 _L1 = mat.getCol0().get128();
+	__m128 _L2 = mat.getCol1().get128();
+	__m128 _L3 = mat.getCol2().get128();
+	__m128 _L4 = mat.getCol3().get128();
+	// Calculating the minterms for the first line.
+
+	// _mm_ror_ps is just a macro using _mm_shuffle_ps().
+	tt = _L4; tt2 = _mm_ror_ps(_L3,1); 
+	Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));					// V3'�V4
+	Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));					// V3'�V4"
+	Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));					// V3'�V4^
+
+	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V3"�V4^ - V3^�V4"
+	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V3^�V4' - V3'�V4^
+	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V3'�V4" - V3"�V4'
+
+	tt = _L2;
+	Va = _mm_ror_ps(tt,1);		sum = _mm_mul_ps(Va,r1);
+	Vb = _mm_ror_ps(tt,2);		sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
+	Vc = _mm_ror_ps(tt,3);		sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
+
+	// Calculating the determinant.
+	Det = _mm_mul_ps(sum,_L1);
+	Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
+
+	const __m128 Sign_PNPN = _mm_load_ps((float *)_vmathPNPN);
+	const __m128 Sign_NPNP = _mm_load_ps((float *)_vmathNPNP);
+
+	__m128 mtL1 = _mm_xor_ps(sum,Sign_PNPN);
+
+	// Calculating the minterms of the second line (using previous results).
+	tt = _mm_ror_ps(_L1,1);		sum = _mm_mul_ps(tt,r1);
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+	__m128 mtL2 = _mm_xor_ps(sum,Sign_NPNP);
+
+	// Testing the determinant.
+	Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
+
+	// Calculating the minterms of the third line.
+	tt = _mm_ror_ps(_L1,1);
+	Va = _mm_mul_ps(tt,Vb);									// V1'�V2"
+	Vb = _mm_mul_ps(tt,Vc);									// V1'�V2^
+	Vc = _mm_mul_ps(tt,_L2);								// V1'�V2
+
+	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V1"�V2^ - V1^�V2"
+	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V1^�V2' - V1'�V2^
+	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V1'�V2" - V1"�V2'
+
+	tt = _mm_ror_ps(_L4,1);		sum = _mm_mul_ps(tt,r1);
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+	__m128 mtL3 = _mm_xor_ps(sum,Sign_PNPN);
+
+	// Dividing is FASTER than rcp_nr! (Because rcp_nr causes many register-memory RWs).
+	RDet = _mm_div_ss(_mm_load_ss((float *)&_vmathZERONE), Det); // TODO: just 1.0f?
+	RDet = _mm_shuffle_ps(RDet,RDet,0x00);
+
+	// Devide the first 12 minterms with the determinant.
+	mtL1 = _mm_mul_ps(mtL1, RDet);
+	mtL2 = _mm_mul_ps(mtL2, RDet);
+	mtL3 = _mm_mul_ps(mtL3, RDet);
+
+	// Calculate the minterms of the forth line and devide by the determinant.
+	tt = _mm_ror_ps(_L3,1);		sum = _mm_mul_ps(tt,r1);
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+	__m128 mtL4 = _mm_xor_ps(sum,Sign_NPNP);
+	mtL4 = _mm_mul_ps(mtL4, RDet);
+
+	// Now we just have to transpose the minterms matrix.
+	trns0 = _mm_unpacklo_ps(mtL1,mtL2);
+	trns1 = _mm_unpacklo_ps(mtL3,mtL4);
+	trns2 = _mm_unpackhi_ps(mtL1,mtL2);
+	trns3 = _mm_unpackhi_ps(mtL3,mtL4);
+	_L1 = _mm_movelh_ps(trns0,trns1);
+	_L2 = _mm_movehl_ps(trns1,trns0);
+	_L3 = _mm_movelh_ps(trns2,trns3);
+	_L4 = _mm_movehl_ps(trns3,trns2);
+
+    return Matrix4(
+        Vector4( _L1 ),
+        Vector4( _L2 ),
+        Vector4( _L3 ),
+        Vector4( _L4 )
+    );
+}
+
+__forceinline const Matrix4 affineInverse( const Matrix4 & mat )
+{
+    Transform3 affineMat;
+    affineMat.setCol0( mat.getCol0().getXYZ( ) );
+    affineMat.setCol1( mat.getCol1().getXYZ( ) );
+    affineMat.setCol2( mat.getCol2().getXYZ( ) );
+    affineMat.setCol3( mat.getCol3().getXYZ( ) );
+    return Matrix4( inverse( affineMat ) );
+}
+
+__forceinline const Matrix4 orthoInverse( const Matrix4 & mat )
+{
+    Transform3 affineMat;
+    affineMat.setCol0( mat.getCol0().getXYZ( ) );
+    affineMat.setCol1( mat.getCol1().getXYZ( ) );
+    affineMat.setCol2( mat.getCol2().getXYZ( ) );
+    affineMat.setCol3( mat.getCol3().getXYZ( ) );
+    return Matrix4( orthoInverse( affineMat ) );
+}
+
+__forceinline const floatInVec determinant( const Matrix4 & mat )
+{
+	__m128 Va,Vb,Vc;
+	__m128 r1,r2,r3,tt,tt2;
+	__m128 sum,Det;
+
+	__m128 _L1 = mat.getCol0().get128();
+	__m128 _L2 = mat.getCol1().get128();
+	__m128 _L3 = mat.getCol2().get128();
+	__m128 _L4 = mat.getCol3().get128();
+	// Calculating the minterms for the first line.
+
+	// _mm_ror_ps is just a macro using _mm_shuffle_ps().
+	tt = _L4; tt2 = _mm_ror_ps(_L3,1); 
+	Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));					// V3'�V4
+	Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));					// V3'�V4"
+	Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));					// V3'�V4^
+
+	r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V3"�V4^ - V3^�V4"
+	r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V3^�V4' - V3'�V4^
+	r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V3'�V4" - V3"�V4'
+
+	tt = _L2;
+	Va = _mm_ror_ps(tt,1);		sum = _mm_mul_ps(Va,r1);
+	Vb = _mm_ror_ps(tt,2);		sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
+	Vc = _mm_ror_ps(tt,3);		sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
+
+	// Calculating the determinant.
+	Det = _mm_mul_ps(sum,_L1);
+	Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
+
+	// Calculating the minterms of the second line (using previous results).
+	tt = _mm_ror_ps(_L1,1);		sum = _mm_mul_ps(tt,r1);
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+	tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+
+	// Testing the determinant.
+	Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
+	return floatInVec(Det, 0);
+}
+
+__forceinline const Matrix4 Matrix4::operator +( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( mCol0 + mat.mCol0 ),
+        ( mCol1 + mat.mCol1 ),
+        ( mCol2 + mat.mCol2 ),
+        ( mCol3 + mat.mCol3 )
+    );
+}
+
+__forceinline const Matrix4 Matrix4::operator -( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( mCol0 - mat.mCol0 ),
+        ( mCol1 - mat.mCol1 ),
+        ( mCol2 - mat.mCol2 ),
+        ( mCol3 - mat.mCol3 )
+    );
+}
+
+__forceinline Matrix4 & Matrix4::operator +=( const Matrix4 & mat )
+{
+    *this = *this + mat;
+    return *this;
+}
+
+__forceinline Matrix4 & Matrix4::operator -=( const Matrix4 & mat )
+{
+    *this = *this - mat;
+    return *this;
+}
+
+__forceinline const Matrix4 Matrix4::operator -( ) const
+{
+    return Matrix4(
+        ( -mCol0 ),
+        ( -mCol1 ),
+        ( -mCol2 ),
+        ( -mCol3 )
+    );
+}
+
+__forceinline const Matrix4 absPerElem( const Matrix4 & mat )
+{
+    return Matrix4(
+        absPerElem( mat.getCol0() ),
+        absPerElem( mat.getCol1() ),
+        absPerElem( mat.getCol2() ),
+        absPerElem( mat.getCol3() )
+    );
+}
+
+__forceinline const Matrix4 Matrix4::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+__forceinline const Matrix4 Matrix4::operator *( const floatInVec &scalar ) const
+{
+    return Matrix4(
+        ( mCol0 * scalar ),
+        ( mCol1 * scalar ),
+        ( mCol2 * scalar ),
+        ( mCol3 * scalar )
+    );
+}
+
+__forceinline Matrix4 & Matrix4::operator *=( float scalar )
+{
+    return *this *= floatInVec(scalar);
+}
+
+__forceinline Matrix4 & Matrix4::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+__forceinline const Matrix4 operator *( float scalar, const Matrix4 & mat )
+{
+    return floatInVec(scalar) * mat;
+}
+
+__forceinline const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat )
+{
+    return mat * scalar;
+}
+
+__forceinline const Vector4 Matrix4::operator *( const Vector4 &vec ) const
+{
+    return Vector4(
+		_mm_add_ps(
+			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(1,1,1,1)))),
+			_mm_add_ps(_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(2,2,2,2))), _mm_mul_ps(mCol3.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(3,3,3,3)))))
+		);
+}
+
+__forceinline const Vector4 Matrix4::operator *( const Vector3 &vec ) const
+{
+    return Vector4(
+		_mm_add_ps(
+			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(1,1,1,1)))),
+			_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(vec.get128(), vec.get128(), _MM_SHUFFLE(2,2,2,2))))
+		);
+}
+
+__forceinline const Vector4 Matrix4::operator *( const Point3 &pnt ) const
+{
+    return Vector4(
+		_mm_add_ps(
+			_mm_add_ps(_mm_mul_ps(mCol0.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(mCol1.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(1,1,1,1)))),
+			_mm_add_ps(_mm_mul_ps(mCol2.get128(), _mm_shuffle_ps(pnt.get128(), pnt.get128(), _MM_SHUFFLE(2,2,2,2))), mCol3.get128()))
+		);
+}
+
+__forceinline const Matrix4 Matrix4::operator *( const Matrix4 & mat ) const
+{
+    return Matrix4(
+        ( *this * mat.mCol0 ),
+        ( *this * mat.mCol1 ),
+        ( *this * mat.mCol2 ),
+        ( *this * mat.mCol3 )
+    );
+}
+
+__forceinline Matrix4 & Matrix4::operator *=( const Matrix4 & mat )
+{
+    *this = *this * mat;
+    return *this;
+}
+
+__forceinline const Matrix4 Matrix4::operator *( const Transform3 & tfrm ) const
+{
+    return Matrix4(
+        ( *this * tfrm.getCol0() ),
+        ( *this * tfrm.getCol1() ),
+        ( *this * tfrm.getCol2() ),
+        ( *this * Point3( tfrm.getCol3() ) )
+    );
+}
+
+__forceinline Matrix4 & Matrix4::operator *=( const Transform3 & tfrm )
+{
+    *this = *this * tfrm;
+    return *this;
+}
+
+__forceinline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 )
+{
+    return Matrix4(
+        mulPerElem( mat0.getCol0(), mat1.getCol0() ),
+        mulPerElem( mat0.getCol1(), mat1.getCol1() ),
+        mulPerElem( mat0.getCol2(), mat1.getCol2() ),
+        mulPerElem( mat0.getCol3(), mat1.getCol3() )
+    );
+}
+
+__forceinline const Matrix4 Matrix4::identity( )
+{
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4::yAxis( ),
+        Vector4::zAxis( ),
+        Vector4::wAxis( )
+    );
+}
+
+__forceinline Matrix4 & Matrix4::setUpper3x3( const Matrix3 & mat3 )
+{
+    mCol0.setXYZ( mat3.getCol0() );
+    mCol1.setXYZ( mat3.getCol1() );
+    mCol2.setXYZ( mat3.getCol2() );
+    return *this;
+}
+
+__forceinline const Matrix3 Matrix4::getUpper3x3( ) const
+{
+    return Matrix3(
+        mCol0.getXYZ( ),
+        mCol1.getXYZ( ),
+        mCol2.getXYZ( )
+    );
+}
+
+__forceinline Matrix4 & Matrix4::setTranslation( const Vector3 &translateVec )
+{
+    mCol3.setXYZ( translateVec );
+    return *this;
+}
+
+__forceinline const Vector3 Matrix4::getTranslation( ) const
+{
+    return mCol3.getXYZ( );
+}
+
+__forceinline const Matrix4 Matrix4::rotationX( float radians )
+{
+    return rotationX( floatInVec(radians) );
+}
+
+__forceinline const Matrix4 Matrix4::rotationX( const floatInVec &radians )
+{
+    __m128 s, c, res1, res2;
+    __m128 zero;
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res1 = vec_sel( zero, c, select_y );
+    res1 = vec_sel( res1, s, select_z );
+    res2 = vec_sel( zero, negatef4(s), select_y );
+    res2 = vec_sel( res2, c, select_z );
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4( res1 ),
+        Vector4( res2 ),
+        Vector4::wAxis( )
+    );
+}
+
+__forceinline const Matrix4 Matrix4::rotationY( float radians )
+{
+    return rotationY( floatInVec(radians) );
+}
+
+__forceinline const Matrix4 Matrix4::rotationY( const floatInVec &radians )
+{
+    __m128 s, c, res0, res2;
+    __m128 zero;
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, negatef4(s), select_z );
+    res2 = vec_sel( zero, s, select_x );
+    res2 = vec_sel( res2, c, select_z );
+    return Matrix4(
+        Vector4( res0 ),
+        Vector4::yAxis( ),
+        Vector4( res2 ),
+        Vector4::wAxis( )
+    );
+}
+
+__forceinline const Matrix4 Matrix4::rotationZ( float radians )
+{
+    return rotationZ( floatInVec(radians) );
+}
+
+__forceinline const Matrix4 Matrix4::rotationZ( const floatInVec &radians )
+{
+    __m128 s, c, res0, res1;
+    __m128 zero;
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, s, select_y );
+    res1 = vec_sel( zero, negatef4(s), select_x );
+    res1 = vec_sel( res1, c, select_y );
+    return Matrix4(
+        Vector4( res0 ),
+        Vector4( res1 ),
+        Vector4::zAxis( ),
+        Vector4::wAxis( )
+    );
+}
+
+__forceinline const Matrix4 Matrix4::rotationZYX( const Vector3 &radiansXYZ )
+{
+    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
+    angles = Vector4( radiansXYZ, 0.0f ).get128();
+    sincosf4( angles, &s, &c );
+    negS = negatef4( s );
+    Z0 = vec_mergel( c, s );
+    Z1 = vec_mergel( negS, c );
+	__declspec(align(16)) unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
+    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
+	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
+	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
+    X0 = vec_splat( s, 0 );
+    X1 = vec_splat( c, 0 );
+    tmp = vec_mul( Z0, Y1 );
+    return Matrix4(
+        Vector4( vec_mul( Z0, Y0 ) ),
+        Vector4( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
+        Vector4( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) ),
+        Vector4::wAxis( )
+    );
+}
+
+__forceinline const Matrix4 Matrix4::rotation( float radians, const Vector3 &unitVec )
+{
+    return rotation( floatInVec(radians), unitVec );
+}
+
+__forceinline const Matrix4 Matrix4::rotation( const floatInVec &radians, const Vector3 &unitVec )
+{
+    __m128 axis, s, c, oneMinusC, axisS, negAxisS, xxxx, yyyy, zzzz, tmp0, tmp1, tmp2;
+    axis = unitVec.get128();
+    sincosf4( radians.get128(), &s, &c );
+    xxxx = vec_splat( axis, 0 );
+    yyyy = vec_splat( axis, 1 );
+    zzzz = vec_splat( axis, 2 );
+    oneMinusC = vec_sub( _mm_set1_ps(1.0f), c );
+    axisS = vec_mul( axis, s );
+    negAxisS = negatef4( axisS );
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    //tmp0 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_XZBX );
+	tmp0 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,2,0) );
+	tmp0 = vec_sel(tmp0, vec_splat(negAxisS, 1), select_z);
+    //tmp1 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_CXXX );
+	tmp1 = vec_sel( vec_splat(axisS, 0), vec_splat(negAxisS, 2), select_x );
+    //tmp2 = vec_perm( axisS, negAxisS, _VECTORMATH_PERM_YAXX );
+	tmp2 = _mm_shuffle_ps( axisS, axisS, _MM_SHUFFLE(0,0,0,1) );
+	tmp2 = vec_sel(tmp2, vec_splat(negAxisS, 0), select_y);
+    tmp0 = vec_sel( tmp0, c, select_x );
+    tmp1 = vec_sel( tmp1, c, select_y );
+    tmp2 = vec_sel( tmp2, c, select_z );
+	__declspec(align(16)) unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
+    axis = vec_and( axis, _mm_load_ps( (float *)select_xyz ) );
+    tmp0 = vec_and( tmp0, _mm_load_ps( (float *)select_xyz ) );
+    tmp1 = vec_and( tmp1, _mm_load_ps( (float *)select_xyz ) );
+    tmp2 = vec_and( tmp2, _mm_load_ps( (float *)select_xyz ) );
+    return Matrix4(
+        Vector4( vec_madd( vec_mul( axis, xxxx ), oneMinusC, tmp0 ) ),
+        Vector4( vec_madd( vec_mul( axis, yyyy ), oneMinusC, tmp1 ) ),
+        Vector4( vec_madd( vec_mul( axis, zzzz ), oneMinusC, tmp2 ) ),
+        Vector4::wAxis( )
+    );
+}
+
+__forceinline const Matrix4 Matrix4::rotation( const Quat &unitQuat )
+{
+    return Matrix4( Transform3::rotation( unitQuat ) );
+}
+
+__forceinline const Matrix4 Matrix4::scale( const Vector3 &scaleVec )
+{
+    __m128 zero = _mm_setzero_ps();
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    return Matrix4(
+        Vector4( vec_sel( zero, scaleVec.get128(), select_x ) ),
+        Vector4( vec_sel( zero, scaleVec.get128(), select_y ) ),
+        Vector4( vec_sel( zero, scaleVec.get128(), select_z ) ),
+        Vector4::wAxis( )
+    );
+}
+
+__forceinline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec )
+{
+    return Matrix4(
+        ( mat.getCol0() * scaleVec.getX( ) ),
+        ( mat.getCol1() * scaleVec.getY( ) ),
+        ( mat.getCol2() * scaleVec.getZ( ) ),
+        mat.getCol3()
+    );
+}
+
+__forceinline const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat )
+{
+    Vector4 scale4;
+    scale4 = Vector4( scaleVec, 1.0f );
+    return Matrix4(
+        mulPerElem( mat.getCol0(), scale4 ),
+        mulPerElem( mat.getCol1(), scale4 ),
+        mulPerElem( mat.getCol2(), scale4 ),
+        mulPerElem( mat.getCol3(), scale4 )
+    );
+}
+
+__forceinline const Matrix4 Matrix4::translation( const Vector3 &translateVec )
+{
+    return Matrix4(
+        Vector4::xAxis( ),
+        Vector4::yAxis( ),
+        Vector4::zAxis( ),
+        Vector4( translateVec, 1.0f )
+    );
+}
+
+__forceinline const Matrix4 Matrix4::lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec )
+{
+    Matrix4 m4EyeFrame;
+    Vector3 v3X, v3Y, v3Z;
+    v3Y = normalize( upVec );
+    v3Z = normalize( ( eyePos - lookAtPos ) );
+    v3X = normalize( cross( v3Y, v3Z ) );
+    v3Y = cross( v3Z, v3X );
+    m4EyeFrame = Matrix4( Vector4( v3X ), Vector4( v3Y ), Vector4( v3Z ), Vector4( eyePos ) );
+    return orthoInverse( m4EyeFrame );
+}
+
+__forceinline const Matrix4 Matrix4::perspective( float fovyRadians, float aspect, float zNear, float zFar )
+{
+    float f, rangeInv;
+    __m128 zero, col0, col1, col2, col3;
+    union { __m128 v; float s[4]; } tmp;
+    f = tanf( _VECTORMATH_PI_OVER_2 - fovyRadians * 0.5f );
+    rangeInv = 1.0f / ( zNear - zFar );
+    zero = _mm_setzero_ps();
+    tmp.v = zero;
+    tmp.s[0] = f / aspect;
+    col0 = tmp.v;
+    tmp.v = zero;
+    tmp.s[1] = f;
+    col1 = tmp.v;
+    tmp.v = zero;
+    tmp.s[2] = ( zNear + zFar ) * rangeInv;
+    tmp.s[3] = -1.0f;
+    col2 = tmp.v;
+    tmp.v = zero;
+    tmp.s[2] = zNear * zFar * rangeInv * 2.0f;
+    col3 = tmp.v;
+    return Matrix4(
+        Vector4( col0 ),
+        Vector4( col1 ),
+        Vector4( col2 ),
+        Vector4( col3 )
+    );
+}
+
+__forceinline const Matrix4 Matrix4::frustum( float left, float right, float bottom, float top, float zNear, float zFar )
+{
+    /* function implementation based on code from STIDC SDK:           */
+    /* --------------------------------------------------------------  */
+    /* PLEASE DO NOT MODIFY THIS SECTION                               */
+    /* This prolog section is automatically generated.                 */
+    /*                                                                 */
+    /* (C)Copyright                                                    */
+    /* Sony Computer Entertainment, Inc.,                              */
+    /* Toshiba Corporation,                                            */
+    /* International Business Machines Corporation,                    */
+    /* 2001,2002.                                                      */
+    /* S/T/I Confidential Information                                  */
+    /* --------------------------------------------------------------  */
+    __m128 lbf, rtn;
+    __m128 diff, sum, inv_diff;
+    __m128 diagonal, column, near2;
+    __m128 zero = _mm_setzero_ps();
+    union { __m128 v; float s[4]; } l, f, r, n, b, t; // TODO: Union?
+    l.s[0] = left;
+    f.s[0] = zFar;
+    r.s[0] = right;
+    n.s[0] = zNear;
+    b.s[0] = bottom;
+    t.s[0] = top;
+    lbf = vec_mergeh( l.v, f.v );
+    rtn = vec_mergeh( r.v, n.v );
+    lbf = vec_mergeh( lbf, b.v );
+    rtn = vec_mergeh( rtn, t.v );
+    diff = vec_sub( rtn, lbf );
+    sum  = vec_add( rtn, lbf );
+    inv_diff = recipf4( diff );
+    near2 = vec_splat( n.v, 0 );
+    near2 = vec_add( near2, near2 );
+    diagonal = vec_mul( near2, inv_diff );
+    column = vec_mul( sum, inv_diff );
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+	__declspec(align(16)) unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
+    return Matrix4(
+        Vector4( vec_sel( zero, diagonal, select_x ) ),
+        Vector4( vec_sel( zero, diagonal, select_y ) ),
+        Vector4( vec_sel( column, _mm_set1_ps(-1.0f), select_w ) ),
+        Vector4( vec_sel( zero, vec_mul( diagonal, vec_splat( f.v, 0 ) ), select_z ) )
+	);
+}
+
+__forceinline const Matrix4 Matrix4::orthographic( float left, float right, float bottom, float top, float zNear, float zFar )
+{
+    /* function implementation based on code from STIDC SDK:           */
+    /* --------------------------------------------------------------  */
+    /* PLEASE DO NOT MODIFY THIS SECTION                               */
+    /* This prolog section is automatically generated.                 */
+    /*                                                                 */
+    /* (C)Copyright                                                    */
+    /* Sony Computer Entertainment, Inc.,                              */
+    /* Toshiba Corporation,                                            */
+    /* International Business Machines Corporation,                    */
+    /* 2001,2002.                                                      */
+    /* S/T/I Confidential Information                                  */
+    /* --------------------------------------------------------------  */
+    __m128 lbf, rtn;
+    __m128 diff, sum, inv_diff, neg_inv_diff;
+    __m128 diagonal, column;
+    __m128 zero = _mm_setzero_ps();
+    union { __m128 v; float s[4]; } l, f, r, n, b, t;
+    l.s[0] = left;
+    f.s[0] = zFar;
+    r.s[0] = right;
+    n.s[0] = zNear;
+    b.s[0] = bottom;
+    t.s[0] = top;
+    lbf = vec_mergeh( l.v, f.v );
+    rtn = vec_mergeh( r.v, n.v );
+    lbf = vec_mergeh( lbf, b.v );
+    rtn = vec_mergeh( rtn, t.v );
+    diff = vec_sub( rtn, lbf );
+    sum  = vec_add( rtn, lbf );
+    inv_diff = recipf4( diff );
+    neg_inv_diff = negatef4( inv_diff );
+    diagonal = vec_add( inv_diff, inv_diff );
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+	__declspec(align(16)) unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
+    column = vec_mul( sum, vec_sel( neg_inv_diff, inv_diff, select_z ) ); // TODO: no madds with zero
+    return Matrix4(
+        Vector4( vec_sel( zero, diagonal, select_x ) ),
+        Vector4( vec_sel( zero, diagonal, select_y ) ),
+        Vector4( vec_sel( zero, diagonal, select_z ) ),
+        Vector4( vec_sel( column, _mm_set1_ps(1.0f), select_w ) )
+    );
+}
+
+__forceinline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 )
+{
+    return Matrix4(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 ),
+        select( mat0.getCol3(), mat1.getCol3(), select1 )
+    );
+}
+
+__forceinline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 )
+{
+    return Matrix4(
+        select( mat0.getCol0(), mat1.getCol0(), select1 ),
+        select( mat0.getCol1(), mat1.getCol1(), select1 ),
+        select( mat0.getCol2(), mat1.getCol2(), select1 ),
+        select( mat0.getCol3(), mat1.getCol3(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+__forceinline void print( const Matrix4 & mat )
+{
+    print( mat.getRow( 0 ) );
+    print( mat.getRow( 1 ) );
+    print( mat.getRow( 2 ) );
+    print( mat.getRow( 3 ) );
+}
+
+__forceinline void print( const Matrix4 & mat, const char * name )
+{
+    printf("%s:\n", name);
+    print( mat );
+}
+
+#endif
+
+__forceinline Transform3::Transform3( const Transform3 & tfrm )
+{
+    mCol0 = tfrm.mCol0;
+    mCol1 = tfrm.mCol1;
+    mCol2 = tfrm.mCol2;
+    mCol3 = tfrm.mCol3;
+}
+
+__forceinline Transform3::Transform3( float scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+    mCol3 = Vector3( scalar );
+}
+
+__forceinline Transform3::Transform3( const floatInVec &scalar )
+{
+    mCol0 = Vector3( scalar );
+    mCol1 = Vector3( scalar );
+    mCol2 = Vector3( scalar );
+    mCol3 = Vector3( scalar );
+}
+
+__forceinline Transform3::Transform3( const Vector3 &_col0, const Vector3 &_col1, const Vector3 &_col2, const Vector3 &_col3 )
+{
+    mCol0 = _col0;
+    mCol1 = _col1;
+    mCol2 = _col2;
+    mCol3 = _col3;
+}
+
+__forceinline Transform3::Transform3( const Matrix3 & tfrm, const Vector3 &translateVec )
+{
+    this->setUpper3x3( tfrm );
+    this->setTranslation( translateVec );
+}
+
+__forceinline Transform3::Transform3( const Quat &unitQuat, const Vector3 &translateVec )
+{
+    this->setUpper3x3( Matrix3( unitQuat ) );
+    this->setTranslation( translateVec );
+}
+
+__forceinline Transform3 & Transform3::setCol0( const Vector3 &_col0 )
+{
+    mCol0 = _col0;
+    return *this;
+}
+
+__forceinline Transform3 & Transform3::setCol1( const Vector3 &_col1 )
+{
+    mCol1 = _col1;
+    return *this;
+}
+
+__forceinline Transform3 & Transform3::setCol2( const Vector3 &_col2 )
+{
+    mCol2 = _col2;
+    return *this;
+}
+
+__forceinline Transform3 & Transform3::setCol3( const Vector3 &_col3 )
+{
+    mCol3 = _col3;
+    return *this;
+}
+
+__forceinline Transform3 & Transform3::setCol( int col, const Vector3 &vec )
+{
+    *(&mCol0 + col) = vec;
+    return *this;
+}
+
+__forceinline Transform3 & Transform3::setRow( int row, const Vector4 &vec )
+{
+    mCol0.setElem( row, vec.getElem( 0 ) );
+    mCol1.setElem( row, vec.getElem( 1 ) );
+    mCol2.setElem( row, vec.getElem( 2 ) );
+    mCol3.setElem( row, vec.getElem( 3 ) );
+    return *this;
+}
+
+__forceinline Transform3 & Transform3::setElem( int col, int row, float val )
+{
+    (*this)[col].setElem(row, val);
+    return *this;
+}
+
+__forceinline Transform3 & Transform3::setElem( int col, int row, const floatInVec &val )
+{
+    Vector3 tmpV3_0;
+    tmpV3_0 = this->getCol( col );
+    tmpV3_0.setElem( row, val );
+    this->setCol( col, tmpV3_0 );
+    return *this;
+}
+
+__forceinline const floatInVec Transform3::getElem( int col, int row ) const
+{
+    return this->getCol( col ).getElem( row );
+}
+
+__forceinline const Vector3 Transform3::getCol0( ) const
+{
+    return mCol0;
+}
+
+__forceinline const Vector3 Transform3::getCol1( ) const
+{
+    return mCol1;
+}
+
+__forceinline const Vector3 Transform3::getCol2( ) const
+{
+    return mCol2;
+}
+
+__forceinline const Vector3 Transform3::getCol3( ) const
+{
+    return mCol3;
+}
+
+__forceinline const Vector3 Transform3::getCol( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+__forceinline const Vector4 Transform3::getRow( int row ) const
+{
+    return Vector4( mCol0.getElem( row ), mCol1.getElem( row ), mCol2.getElem( row ), mCol3.getElem( row ) );
+}
+
+__forceinline Vector3 & Transform3::operator []( int col )
+{
+    return *(&mCol0 + col);
+}
+
+__forceinline const Vector3 Transform3::operator []( int col ) const
+{
+    return *(&mCol0 + col);
+}
+
+__forceinline Transform3 & Transform3::operator =( const Transform3 & tfrm )
+{
+    mCol0 = tfrm.mCol0;
+    mCol1 = tfrm.mCol1;
+    mCol2 = tfrm.mCol2;
+    mCol3 = tfrm.mCol3;
+    return *this;
+}
+
+__forceinline const Transform3 inverse( const Transform3 & tfrm )
+{
+    __m128 inv0, inv1, inv2, inv3;
+    __m128 tmp0, tmp1, tmp2, tmp3, tmp4, dot, invdet;
+    __m128 xxxx, yyyy, zzzz;
+    tmp2 = _vmathVfCross( tfrm.getCol0().get128(), tfrm.getCol1().get128() );
+    tmp0 = _vmathVfCross( tfrm.getCol1().get128(), tfrm.getCol2().get128() );
+    tmp1 = _vmathVfCross( tfrm.getCol2().get128(), tfrm.getCol0().get128() );
+    inv3 = negatef4( tfrm.getCol3().get128() );
+    dot = _vmathVfDot3( tmp2, tfrm.getCol2().get128() );
+    dot = vec_splat( dot, 0 );
+    invdet = recipf4( dot );
+    tmp3 = vec_mergeh( tmp0, tmp2 );
+    tmp4 = vec_mergel( tmp0, tmp2 );
+    inv0 = vec_mergeh( tmp3, tmp1 );
+    xxxx = vec_splat( inv3, 0 );
+    //inv1 = vec_perm( tmp3, tmp1, _VECTORMATH_PERM_ZBWX );
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	inv1 = _mm_shuffle_ps( tmp3, tmp3, _MM_SHUFFLE(0,3,2,2));
+	inv1 = vec_sel(inv1, tmp1, select_y);
+    //inv2 = vec_perm( tmp4, tmp1, _VECTORMATH_PERM_XCYX );
+	inv2 = _mm_shuffle_ps( tmp4, tmp4, _MM_SHUFFLE(0,1,1,0));
+	inv2 = vec_sel(inv2, vec_splat(tmp1, 2), select_y);
+    yyyy = vec_splat( inv3, 1 );
+    zzzz = vec_splat( inv3, 2 );
+    inv3 = vec_mul( inv0, xxxx );
+    inv3 = vec_madd( inv1, yyyy, inv3 );
+    inv3 = vec_madd( inv2, zzzz, inv3 );
+    inv0 = vec_mul( inv0, invdet );
+    inv1 = vec_mul( inv1, invdet );
+    inv2 = vec_mul( inv2, invdet );
+    inv3 = vec_mul( inv3, invdet );
+    return Transform3(
+        Vector3( inv0 ),
+        Vector3( inv1 ),
+        Vector3( inv2 ),
+        Vector3( inv3 )
+    );
+}
+
+__forceinline const Transform3 orthoInverse( const Transform3 & tfrm )
+{
+    __m128 inv0, inv1, inv2, inv3;
+    __m128 tmp0, tmp1;
+    __m128 xxxx, yyyy, zzzz;
+    tmp0 = vec_mergeh( tfrm.getCol0().get128(), tfrm.getCol2().get128() );
+    tmp1 = vec_mergel( tfrm.getCol0().get128(), tfrm.getCol2().get128() );
+    inv3 = negatef4( tfrm.getCol3().get128() );
+    inv0 = vec_mergeh( tmp0, tfrm.getCol1().get128() );
+    xxxx = vec_splat( inv3, 0 );
+    //inv1 = vec_perm( tmp0, tfrm.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	inv1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
+	inv1 = vec_sel(inv1, tfrm.getCol1().get128(), select_y);
+    //inv2 = vec_perm( tmp1, tfrm.getCol1().get128(), _VECTORMATH_PERM_XCYX );
+	inv2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
+	inv2 = vec_sel(inv2, vec_splat(tfrm.getCol1().get128(), 2), select_y);
+    yyyy = vec_splat( inv3, 1 );
+    zzzz = vec_splat( inv3, 2 );
+    inv3 = vec_mul( inv0, xxxx );
+    inv3 = vec_madd( inv1, yyyy, inv3 );
+    inv3 = vec_madd( inv2, zzzz, inv3 );
+    return Transform3(
+        Vector3( inv0 ),
+        Vector3( inv1 ),
+        Vector3( inv2 ),
+        Vector3( inv3 )
+    );
+}
+
+__forceinline const Transform3 absPerElem( const Transform3 & tfrm )
+{
+    return Transform3(
+        absPerElem( tfrm.getCol0() ),
+        absPerElem( tfrm.getCol1() ),
+        absPerElem( tfrm.getCol2() ),
+        absPerElem( tfrm.getCol3() )
+    );
+}
+
+__forceinline const Vector3 Transform3::operator *( const Vector3 &vec ) const
+{
+    __m128 res;
+    __m128 xxxx, yyyy, zzzz;
+    xxxx = vec_splat( vec.get128(), 0 );
+    yyyy = vec_splat( vec.get128(), 1 );
+    zzzz = vec_splat( vec.get128(), 2 );
+    res = vec_mul( mCol0.get128(), xxxx );
+    res = vec_madd( mCol1.get128(), yyyy, res );
+    res = vec_madd( mCol2.get128(), zzzz, res );
+    return Vector3( res );
+}
+
+__forceinline const Point3 Transform3::operator *( const Point3 &pnt ) const
+{
+    __m128 tmp0, tmp1, res;
+    __m128 xxxx, yyyy, zzzz;
+    xxxx = vec_splat( pnt.get128(), 0 );
+    yyyy = vec_splat( pnt.get128(), 1 );
+    zzzz = vec_splat( pnt.get128(), 2 );
+    tmp0 = vec_mul( mCol0.get128(), xxxx );
+    tmp1 = vec_mul( mCol1.get128(), yyyy );
+    tmp0 = vec_madd( mCol2.get128(), zzzz, tmp0 );
+    tmp1 = vec_add( mCol3.get128(), tmp1 );
+    res = vec_add( tmp0, tmp1 );
+    return Point3( res );
+}
+
+__forceinline const Transform3 Transform3::operator *( const Transform3 & tfrm ) const
+{
+    return Transform3(
+        ( *this * tfrm.mCol0 ),
+        ( *this * tfrm.mCol1 ),
+        ( *this * tfrm.mCol2 ),
+        Vector3( ( *this * Point3( tfrm.mCol3 ) ) )
+    );
+}
+
+__forceinline Transform3 & Transform3::operator *=( const Transform3 & tfrm )
+{
+    *this = *this * tfrm;
+    return *this;
+}
+
+__forceinline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 )
+{
+    return Transform3(
+        mulPerElem( tfrm0.getCol0(), tfrm1.getCol0() ),
+        mulPerElem( tfrm0.getCol1(), tfrm1.getCol1() ),
+        mulPerElem( tfrm0.getCol2(), tfrm1.getCol2() ),
+        mulPerElem( tfrm0.getCol3(), tfrm1.getCol3() )
+    );
+}
+
+__forceinline const Transform3 Transform3::identity( )
+{
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( ),
+        Vector3( 0.0f )
+    );
+}
+
+__forceinline Transform3 & Transform3::setUpper3x3( const Matrix3 & tfrm )
+{
+    mCol0 = tfrm.getCol0();
+    mCol1 = tfrm.getCol1();
+    mCol2 = tfrm.getCol2();
+    return *this;
+}
+
+__forceinline const Matrix3 Transform3::getUpper3x3( ) const
+{
+    return Matrix3( mCol0, mCol1, mCol2 );
+}
+
+__forceinline Transform3 & Transform3::setTranslation( const Vector3 &translateVec )
+{
+    mCol3 = translateVec;
+    return *this;
+}
+
+__forceinline const Vector3 Transform3::getTranslation( ) const
+{
+    return mCol3;
+}
+
+__forceinline const Transform3 Transform3::rotationX( float radians )
+{
+    return rotationX( floatInVec(radians) );
+}
+
+__forceinline const Transform3 Transform3::rotationX( const floatInVec &radians )
+{
+    __m128 s, c, res1, res2;
+    __m128 zero;
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res1 = vec_sel( zero, c, select_y );
+    res1 = vec_sel( res1, s, select_z );
+    res2 = vec_sel( zero, negatef4(s), select_y );
+    res2 = vec_sel( res2, c, select_z );
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3( res1 ),
+        Vector3( res2 ),
+        Vector3( _mm_setzero_ps() )
+    );
+}
+
+__forceinline const Transform3 Transform3::rotationY( float radians )
+{
+    return rotationY( floatInVec(radians) );
+}
+
+__forceinline const Transform3 Transform3::rotationY( const floatInVec &radians )
+{
+    __m128 s, c, res0, res2;
+    __m128 zero;
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, negatef4(s), select_z );
+    res2 = vec_sel( zero, s, select_x );
+    res2 = vec_sel( res2, c, select_z );
+    return Transform3(
+        Vector3( res0 ),
+        Vector3::yAxis( ),
+        Vector3( res2 ),
+        Vector3( 0.0f )
+    );
+}
+
+__forceinline const Transform3 Transform3::rotationZ( float radians )
+{
+    return rotationZ( floatInVec(radians) );
+}
+
+__forceinline const Transform3 Transform3::rotationZ( const floatInVec &radians )
+{
+    __m128 s, c, res0, res1;
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+    __m128 zero = _mm_setzero_ps();
+    sincosf4( radians.get128(), &s, &c );
+    res0 = vec_sel( zero, c, select_x );
+    res0 = vec_sel( res0, s, select_y );
+    res1 = vec_sel( zero, negatef4(s), select_x );
+    res1 = vec_sel( res1, c, select_y );
+    return Transform3(
+        Vector3( res0 ),
+        Vector3( res1 ),
+        Vector3::zAxis( ),
+        Vector3( 0.0f )
+    );
+}
+
+__forceinline const Transform3 Transform3::rotationZYX( const Vector3 &radiansXYZ )
+{
+    __m128 angles, s, negS, c, X0, X1, Y0, Y1, Z0, Z1, tmp;
+    angles = Vector4( radiansXYZ, 0.0f ).get128();
+    sincosf4( angles, &s, &c );
+    negS = negatef4( s );
+    Z0 = vec_mergel( c, s );
+    Z1 = vec_mergel( negS, c );
+	__declspec(align(16)) unsigned int select_xyz[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0};
+    Z1 = vec_and( Z1, _mm_load_ps( (float *)select_xyz ) );
+	Y0 = _mm_shuffle_ps( c, negS, _MM_SHUFFLE(0,1,1,1) );
+	Y1 = _mm_shuffle_ps( s, c, _MM_SHUFFLE(0,1,1,1) );
+    X0 = vec_splat( s, 0 );
+    X1 = vec_splat( c, 0 );
+    tmp = vec_mul( Z0, Y1 );
+    return Transform3(
+        Vector3( vec_mul( Z0, Y0 ) ),
+        Vector3( vec_madd( Z1, X1, vec_mul( tmp, X0 ) ) ),
+        Vector3( vec_nmsub( Z1, X0, vec_mul( tmp, X1 ) ) ),
+        Vector3( 0.0f )
+    );
+}
+
+__forceinline const Transform3 Transform3::rotation( float radians, const Vector3 &unitVec )
+{
+    return rotation( floatInVec(radians), unitVec );
+}
+
+__forceinline const Transform3 Transform3::rotation( const floatInVec &radians, const Vector3 &unitVec )
+{
+    return Transform3( Matrix3::rotation( radians, unitVec ), Vector3( 0.0f ) );
+}
+
+__forceinline const Transform3 Transform3::rotation( const Quat &unitQuat )
+{
+    return Transform3( Matrix3( unitQuat ), Vector3( 0.0f ) );
+}
+
+__forceinline const Transform3 Transform3::scale( const Vector3 &scaleVec )
+{
+    __m128 zero = _mm_setzero_ps();
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    return Transform3(
+        Vector3( vec_sel( zero, scaleVec.get128(), select_x ) ),
+        Vector3( vec_sel( zero, scaleVec.get128(), select_y ) ),
+        Vector3( vec_sel( zero, scaleVec.get128(), select_z ) ),
+        Vector3( 0.0f )
+    );
+}
+
+__forceinline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec )
+{
+    return Transform3(
+        ( tfrm.getCol0() * scaleVec.getX( ) ),
+        ( tfrm.getCol1() * scaleVec.getY( ) ),
+        ( tfrm.getCol2() * scaleVec.getZ( ) ),
+        tfrm.getCol3()
+    );
+}
+
+__forceinline const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm )
+{
+    return Transform3(
+        mulPerElem( tfrm.getCol0(), scaleVec ),
+        mulPerElem( tfrm.getCol1(), scaleVec ),
+        mulPerElem( tfrm.getCol2(), scaleVec ),
+        mulPerElem( tfrm.getCol3(), scaleVec )
+    );
+}
+
+__forceinline const Transform3 Transform3::translation( const Vector3 &translateVec )
+{
+    return Transform3(
+        Vector3::xAxis( ),
+        Vector3::yAxis( ),
+        Vector3::zAxis( ),
+        translateVec
+    );
+}
+
+__forceinline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 )
+{
+    return Transform3(
+        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
+        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
+        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
+        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
+    );
+}
+
+__forceinline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 )
+{
+    return Transform3(
+        select( tfrm0.getCol0(), tfrm1.getCol0(), select1 ),
+        select( tfrm0.getCol1(), tfrm1.getCol1(), select1 ),
+        select( tfrm0.getCol2(), tfrm1.getCol2(), select1 ),
+        select( tfrm0.getCol3(), tfrm1.getCol3(), select1 )
+    );
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+__forceinline void print( const Transform3 & tfrm )
+{
+    print( tfrm.getRow( 0 ) );
+    print( tfrm.getRow( 1 ) );
+    print( tfrm.getRow( 2 ) );
+}
+
+__forceinline void print( const Transform3 & tfrm, const char * name )
+{
+    printf("%s:\n", name);
+    print( tfrm );
+}
+
+#endif
+
+__forceinline Quat::Quat( const Matrix3 & tfrm )
+{
+    __m128 res;
+    __m128 col0, col1, col2;
+    __m128 xx_yy, xx_yy_zz_xx, yy_zz_xx_yy, zz_xx_yy_zz, diagSum, diagDiff;
+    __m128 zy_xz_yx, yz_zx_xy, sum, diff;
+    __m128 radicand, invSqrt, scale;
+    __m128 res0, res1, res2, res3;
+    __m128 xx, yy, zz;
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+	__declspec(align(16)) unsigned int select_w[4] = {0, 0, 0, 0xffffffff};
+
+    col0 = tfrm.getCol0().get128();
+    col1 = tfrm.getCol1().get128();
+    col2 = tfrm.getCol2().get128();
+
+    /* four cases: */
+    /* trace > 0 */
+    /* else */
+    /*    xx largest diagonal element */
+    /*    yy largest diagonal element */
+    /*    zz largest diagonal element */
+
+    /* compute quaternion for each case */
+
+    xx_yy = vec_sel( col0, col1, select_y );
+    //xx_yy_zz_xx = vec_perm( xx_yy, col2, _VECTORMATH_PERM_XYCX );
+    //yy_zz_xx_yy = vec_perm( xx_yy, col2, _VECTORMATH_PERM_YCXY );
+    //zz_xx_yy_zz = vec_perm( xx_yy, col2, _VECTORMATH_PERM_CXYC );
+    xx_yy_zz_xx = _mm_shuffle_ps( xx_yy, xx_yy, _MM_SHUFFLE(0,0,1,0) );
+    xx_yy_zz_xx = vec_sel( xx_yy_zz_xx, col2, select_z ); // TODO: Ck
+    yy_zz_xx_yy = _mm_shuffle_ps( xx_yy_zz_xx, xx_yy_zz_xx, _MM_SHUFFLE(1,0,2,1) );
+    zz_xx_yy_zz = _mm_shuffle_ps( xx_yy_zz_xx, xx_yy_zz_xx, _MM_SHUFFLE(2,1,0,2) );
+
+    diagSum = vec_add( vec_add( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz );
+    diagDiff = vec_sub( vec_sub( xx_yy_zz_xx, yy_zz_xx_yy ), zz_xx_yy_zz );
+    radicand = vec_add( vec_sel( diagDiff, diagSum, select_w ), _mm_set1_ps(1.0f) );
+ //   invSqrt = rsqrtf4( radicand );
+	invSqrt = newtonrapson_rsqrt4( radicand );
+
+	
+
+    zy_xz_yx = vec_sel( col0, col1, select_z );									// zy_xz_yx = 00 01 12 03
+    //zy_xz_yx = vec_perm( zy_xz_yx, col2, _VECTORMATH_PERM_ZAYX );
+	zy_xz_yx = _mm_shuffle_ps( zy_xz_yx, zy_xz_yx, _MM_SHUFFLE(0,1,2,2) );		// zy_xz_yx = 12 12 01 00
+    zy_xz_yx = vec_sel( zy_xz_yx, vec_splat(col2, 0), select_y );				// zy_xz_yx = 12 20 01 00
+    yz_zx_xy = vec_sel( col0, col1, select_x );									// yz_zx_xy = 10 01 02 03
+    //yz_zx_xy = vec_perm( yz_zx_xy, col2, _VECTORMATH_PERM_BZXX );
+	yz_zx_xy = _mm_shuffle_ps( yz_zx_xy, yz_zx_xy, _MM_SHUFFLE(0,0,2,0) );		// yz_zx_xy = 10 02 10 10
+	yz_zx_xy = vec_sel( yz_zx_xy, vec_splat(col2, 1), select_x );				// yz_zx_xy = 21 02 10 10
+
+    sum = vec_add( zy_xz_yx, yz_zx_xy );
+    diff = vec_sub( zy_xz_yx, yz_zx_xy );
+
+    scale = vec_mul( invSqrt, _mm_set1_ps(0.5f) );
+
+    //res0 = vec_perm( sum, diff, _VECTORMATH_PERM_XZYA );
+	res0 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,1,2,0) );
+	res0 = vec_sel( res0, vec_splat(diff, 0), select_w );  // TODO: Ck
+    //res1 = vec_perm( sum, diff, _VECTORMATH_PERM_ZXXB );
+	res1 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,0,0,2) );
+	res1 = vec_sel( res1, vec_splat(diff, 1), select_w );  // TODO: Ck
+    //res2 = vec_perm( sum, diff, _VECTORMATH_PERM_YXXC );
+	res2 = _mm_shuffle_ps( sum, sum, _MM_SHUFFLE(0,0,0,1) );
+	res2 = vec_sel( res2, vec_splat(diff, 2), select_w );  // TODO: Ck
+    res3 = diff;
+    res0 = vec_sel( res0, radicand, select_x );
+    res1 = vec_sel( res1, radicand, select_y );
+    res2 = vec_sel( res2, radicand, select_z );
+    res3 = vec_sel( res3, radicand, select_w );
+    res0 = vec_mul( res0, vec_splat( scale, 0 ) );
+    res1 = vec_mul( res1, vec_splat( scale, 1 ) );
+    res2 = vec_mul( res2, vec_splat( scale, 2 ) );
+    res3 = vec_mul( res3, vec_splat( scale, 3 ) );
+
+    /* determine case and select answer */
+
+    xx = vec_splat( col0, 0 );
+    yy = vec_splat( col1, 1 );
+    zz = vec_splat( col2, 2 );
+    res = vec_sel( res0, res1, vec_cmpgt( yy, xx ) );
+    res = vec_sel( res, res2, vec_and( vec_cmpgt( zz, xx ), vec_cmpgt( zz, yy ) ) );
+    res = vec_sel( res, res3, vec_cmpgt( vec_splat( diagSum, 0 ), _mm_setzero_ps() ) );
+    mVec128 = res;
+}
+
+__forceinline const Matrix3 outer( const Vector3 &tfrm0, const Vector3 &tfrm1 )
+{
+    return Matrix3(
+        ( tfrm0 * tfrm1.getX( ) ),
+        ( tfrm0 * tfrm1.getY( ) ),
+        ( tfrm0 * tfrm1.getZ( ) )
+    );
+}
+
+__forceinline const Matrix4 outer( const Vector4 &tfrm0, const Vector4 &tfrm1 )
+{
+    return Matrix4(
+        ( tfrm0 * tfrm1.getX( ) ),
+        ( tfrm0 * tfrm1.getY( ) ),
+        ( tfrm0 * tfrm1.getZ( ) ),
+        ( tfrm0 * tfrm1.getW( ) )
+    );
+}
+
+__forceinline const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat )
+{
+    __m128 tmp0, tmp1, mcol0, mcol1, mcol2, res;
+    __m128 xxxx, yyyy, zzzz;
+    tmp0 = vec_mergeh( mat.getCol0().get128(), mat.getCol2().get128() );
+    tmp1 = vec_mergel( mat.getCol0().get128(), mat.getCol2().get128() );
+    xxxx = vec_splat( vec.get128(), 0 );
+    mcol0 = vec_mergeh( tmp0, mat.getCol1().get128() );
+    //mcol1 = vec_perm( tmp0, mat.getCol1().get128(), _VECTORMATH_PERM_ZBWX );
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	mcol1 = _mm_shuffle_ps( tmp0, tmp0, _MM_SHUFFLE(0,3,2,2));
+	mcol1 = vec_sel(mcol1, mat.getCol1().get128(), select_y);
+    //mcol2 = vec_perm( tmp1, mat.getCol1().get128(), _VECTORMATH_PERM_XCYX );
+	mcol2 = _mm_shuffle_ps( tmp1, tmp1, _MM_SHUFFLE(0,1,1,0));
+	mcol2 = vec_sel(mcol2, vec_splat(mat.getCol1().get128(), 2), select_y);
+    yyyy = vec_splat( vec.get128(), 1 );
+    res = vec_mul( mcol0, xxxx );
+    zzzz = vec_splat( vec.get128(), 2 );
+    res = vec_madd( mcol1, yyyy, res );
+    res = vec_madd( mcol2, zzzz, res );
+    return Vector3( res );
+}
+
+__forceinline const Matrix3 crossMatrix( const Vector3 &vec )
+{
+    __m128 neg, res0, res1, res2;
+    neg = negatef4( vec.get128() );
+	__declspec(align(16)) unsigned int select_x[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int select_y[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int select_z[4] = {0, 0, 0xffffffff, 0};
+    //res0 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_XZBX );
+	res0 = _mm_shuffle_ps( vec.get128(), vec.get128(), _MM_SHUFFLE(0,2,2,0) );
+	res0 = vec_sel(res0, vec_splat(neg, 1), select_z);
+    //res1 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_CXXX );
+	res1 = vec_sel(vec_splat(vec.get128(), 0), vec_splat(neg, 2), select_x);
+    //res2 = vec_perm( vec.get128(), neg, _VECTORMATH_PERM_YAXX );
+	res2 = _mm_shuffle_ps( vec.get128(), vec.get128(), _MM_SHUFFLE(0,0,1,1) );
+	res2 = vec_sel(res2, vec_splat(neg, 0), select_y);
+	__declspec(align(16)) unsigned int filter_x[4] = {0, 0xffffffff, 0xffffffff, 0xffffffff};
+	__declspec(align(16)) unsigned int filter_y[4] = {0xffffffff, 0, 0xffffffff, 0xffffffff};
+	__declspec(align(16)) unsigned int filter_z[4] = {0xffffffff, 0xffffffff, 0, 0xffffffff};
+    res0 = vec_and( res0, _mm_load_ps((float *)filter_x ) );
+    res1 = vec_and( res1, _mm_load_ps((float *)filter_y ) );
+    res2 = vec_and( res2, _mm_load_ps((float *)filter_z ) ); // TODO: Use selects?
+    return Matrix3(
+        Vector3( res0 ),
+        Vector3( res1 ),
+        Vector3( res2 )
+    );
+}
+
+__forceinline const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat )
+{
+    return Matrix3( cross( vec, mat.getCol0() ), cross( vec, mat.getCol1() ), cross( vec, mat.getCol2() ) );
+}
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/src/BulletMultiThreaded/vectormath/sse/quat_aos.h b/src/BulletMultiThreaded/vectormath/sse/quat_aos.h
new file mode 100644
index 000000000..3d4822142
--- /dev/null
+++ b/src/BulletMultiThreaded/vectormath/sse/quat_aos.h
@@ -0,0 +1,579 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef _VECTORMATH_QUAT_AOS_CPP_H
+#define _VECTORMATH_QUAT_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+__forceinline void Quat::set128(vec_float4 vec)
+{
+    mVec128 = vec;
+}
+
+__forceinline Quat::Quat( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
+{
+	mVec128 = _mm_unpacklo_ps(
+		_mm_unpacklo_ps( _x.get128(), _z.get128() ),
+		_mm_unpacklo_ps( _y.get128(), _w.get128() ) );
+}
+
+__forceinline Quat::Quat( const Vector3 &xyz, float _w )
+{
+    mVec128 = xyz.get128();
+    _vmathVfSetElement(mVec128, _w, 3);
+}
+
+
+
+__forceinline  Quat::Quat(const Quat& quat)
+{
+	mVec128 = quat.get128();
+}
+
+__forceinline Quat::Quat( float _x, float _y, float _z, float _w )
+{
+	mVec128 = _mm_setr_ps(_x, _y, _z, _w);
+}
+
+
+
+
+
+__forceinline Quat::Quat( const Vector3 &xyz, const floatInVec &_w )
+{
+    mVec128 = xyz.get128();
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+}
+
+__forceinline Quat::Quat( const Vector4 &vec )
+{
+    mVec128 = vec.get128();
+}
+
+__forceinline Quat::Quat( float scalar )
+{
+    mVec128 = floatInVec(scalar).get128();
+}
+
+__forceinline Quat::Quat( const floatInVec &scalar )
+{
+    mVec128 = scalar.get128();
+}
+
+__forceinline Quat::Quat( __m128 vf4 )
+{
+    mVec128 = vf4;
+}
+
+__forceinline const Quat Quat::identity( )
+{
+    return Quat( _VECTORMATH_UNIT_0001 );
+}
+
+__forceinline const Quat lerp( float t, const Quat &quat0, const Quat &quat1 )
+{
+    return lerp( floatInVec(t), quat0, quat1 );
+}
+
+__forceinline const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 )
+{
+    return ( quat0 + ( ( quat1 - quat0 ) * t ) );
+}
+
+__forceinline const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 )
+{
+    return slerp( floatInVec(t), unitQuat0, unitQuat1 );
+}
+
+__forceinline const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 )
+{
+    Quat start;
+    vec_float4 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
+    __m128 selectMask;
+    cosAngle = _vmathVfDot4( unitQuat0.get128(), unitQuat1.get128() );
+    selectMask = (__m128)vec_cmpgt( _mm_setzero_ps(), cosAngle );
+    cosAngle = vec_sel( cosAngle, negatef4( cosAngle ), selectMask );
+    start = Quat( vec_sel( unitQuat0.get128(), negatef4( unitQuat0.get128() ), selectMask ) );
+    selectMask = (__m128)vec_cmpgt( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
+    angle = acosf4( cosAngle );
+    tttt = t.get128();
+    oneMinusT = vec_sub( _mm_set1_ps(1.0f), tttt );
+    angles = vec_mergeh( _mm_set1_ps(1.0f), tttt );
+    angles = vec_mergeh( angles, oneMinusT );
+    angles = vec_madd( angles, angle, _mm_setzero_ps() );
+    sines = sinf4( angles );
+    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
+    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
+    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
+    return Quat( vec_madd( start.get128(), scale0, vec_mul( unitQuat1.get128(), scale1 ) ) );
+}
+
+__forceinline const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
+{
+    return squad( floatInVec(t), unitQuat0, unitQuat1, unitQuat2, unitQuat3 );
+}
+
+__forceinline const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 )
+{
+    return slerp( ( ( floatInVec(2.0f) * t ) * ( floatInVec(1.0f) - t ) ), slerp( t, unitQuat0, unitQuat3 ), slerp( t, unitQuat1, unitQuat2 ) );
+}
+
+__forceinline __m128 Quat::get128( ) const
+{
+    return mVec128;
+}
+
+__forceinline Quat & Quat::operator =( const Quat &quat )
+{
+    mVec128 = quat.mVec128;
+    return *this;
+}
+
+__forceinline Quat & Quat::setXYZ( const Vector3 &vec )
+{
+	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+	mVec128 = vec_sel( vec.get128(), mVec128, sw );
+    return *this;
+}
+
+__forceinline const Vector3 Quat::getXYZ( ) const
+{
+    return Vector3( mVec128 );
+}
+
+__forceinline Quat & Quat::setX( float _x )
+{
+    _vmathVfSetElement(mVec128, _x, 0);
+    return *this;
+}
+
+__forceinline Quat & Quat::setX( const floatInVec &_x )
+{
+    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
+    return *this;
+}
+
+__forceinline const floatInVec Quat::getX( ) const
+{
+    return floatInVec( mVec128, 0 );
+}
+
+__forceinline Quat & Quat::setY( float _y )
+{
+    _vmathVfSetElement(mVec128, _y, 1);
+    return *this;
+}
+
+__forceinline Quat & Quat::setY( const floatInVec &_y )
+{
+    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
+    return *this;
+}
+
+__forceinline const floatInVec Quat::getY( ) const
+{
+    return floatInVec( mVec128, 1 );
+}
+
+__forceinline Quat & Quat::setZ( float _z )
+{
+    _vmathVfSetElement(mVec128, _z, 2);
+    return *this;
+}
+
+__forceinline Quat & Quat::setZ( const floatInVec &_z )
+{
+    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
+    return *this;
+}
+
+__forceinline const floatInVec Quat::getZ( ) const
+{
+    return floatInVec( mVec128, 2 );
+}
+
+__forceinline Quat & Quat::setW( float _w )
+{
+    _vmathVfSetElement(mVec128, _w, 3);
+    return *this;
+}
+
+__forceinline Quat & Quat::setW( const floatInVec &_w )
+{
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+    return *this;
+}
+
+__forceinline const floatInVec Quat::getW( ) const
+{
+    return floatInVec( mVec128, 3 );
+}
+
+__forceinline Quat & Quat::setElem( int idx, float value )
+{
+    _vmathVfSetElement(mVec128, value, idx);
+    return *this;
+}
+
+__forceinline Quat & Quat::setElem( int idx, const floatInVec &value )
+{
+    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
+    return *this;
+}
+
+__forceinline const floatInVec Quat::getElem( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+__forceinline VecIdx Quat::operator []( int idx )
+{
+    return VecIdx( mVec128, idx );
+}
+
+__forceinline const floatInVec Quat::operator []( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+__forceinline const Quat Quat::operator +( const Quat &quat ) const
+{
+    return Quat( _mm_add_ps( mVec128, quat.mVec128 ) );
+}
+
+
+__forceinline const Quat Quat::operator -( const Quat &quat ) const
+{
+    return Quat( _mm_sub_ps( mVec128, quat.mVec128 ) );
+}
+
+__forceinline const Quat Quat::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+__forceinline const Quat Quat::operator *( const floatInVec &scalar ) const
+{
+    return Quat( _mm_mul_ps( mVec128, scalar.get128() ) );
+}
+
+__forceinline Quat & Quat::operator +=( const Quat &quat )
+{
+    *this = *this + quat;
+    return *this;
+}
+
+__forceinline Quat & Quat::operator -=( const Quat &quat )
+{
+    *this = *this - quat;
+    return *this;
+}
+
+__forceinline Quat & Quat::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+__forceinline Quat & Quat::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+__forceinline const Quat Quat::operator /( float scalar ) const
+{
+    return *this / floatInVec(scalar);
+}
+
+__forceinline const Quat Quat::operator /( const floatInVec &scalar ) const
+{
+    return Quat( _mm_div_ps( mVec128, scalar.get128() ) );
+}
+
+__forceinline Quat & Quat::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+__forceinline Quat & Quat::operator /=( const floatInVec &scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+__forceinline const Quat Quat::operator -( ) const
+{
+	return Quat(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
+}
+
+__forceinline const Quat operator *( float scalar, const Quat &quat )
+{
+    return floatInVec(scalar) * quat;
+}
+
+__forceinline const Quat operator *( const floatInVec &scalar, const Quat &quat )
+{
+    return quat * scalar;
+}
+
+__forceinline const floatInVec dot( const Quat &quat0, const Quat &quat1 )
+{
+    return floatInVec( _vmathVfDot4( quat0.get128(), quat1.get128() ), 0 );
+}
+
+__forceinline const floatInVec norm( const Quat &quat )
+{
+    return floatInVec(  _vmathVfDot4( quat.get128(), quat.get128() ), 0 );
+}
+
+__forceinline const floatInVec length( const Quat &quat )
+{
+    return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( quat.get128(), quat.get128() )), 0 );
+}
+
+__forceinline const Quat normalize( const Quat &quat )
+{
+	vec_float4 dot =_vmathVfDot4( quat.get128(), quat.get128());
+    return Quat( _mm_mul_ps( quat.get128(), newtonrapson_rsqrt4( dot ) ) );
+}
+
+
+__forceinline const Quat Quat::rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 )
+{
+    Vector3 crossVec;
+    __m128 cosAngle, cosAngleX2Plus2, recipCosHalfAngleX2, cosHalfAngleX2, res;
+    cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() );
+    cosAngleX2Plus2 = vec_madd( cosAngle, _mm_set1_ps(2.0f), _mm_set1_ps(2.0f) );
+    recipCosHalfAngleX2 = _mm_rsqrt_ps( cosAngleX2Plus2 );
+    cosHalfAngleX2 = vec_mul( recipCosHalfAngleX2, cosAngleX2Plus2 );
+    crossVec = cross( unitVec0, unitVec1 );
+    res = vec_mul( crossVec.get128(), recipCosHalfAngleX2 );
+	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( res, vec_mul( cosHalfAngleX2, _mm_set1_ps(0.5f) ), sw );
+    return Quat( res );
+}
+
+__forceinline const Quat Quat::rotation( float radians, const Vector3 &unitVec )
+{
+    return rotation( floatInVec(radians), unitVec );
+}
+
+__forceinline const Quat Quat::rotation( const floatInVec &radians, const Vector3 &unitVec )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( vec_mul( unitVec.get128(), s ), c, sw );
+    return Quat( res );
+}
+
+__forceinline const Quat Quat::rotationX( float radians )
+{
+    return rotationX( floatInVec(radians) );
+}
+
+__forceinline const Quat Quat::rotationX( const floatInVec &radians )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	__declspec(align(16)) unsigned int xsw[4] = {0xffffffff, 0, 0, 0};
+	__declspec(align(16)) unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( _mm_setzero_ps(), s, xsw );
+    res = vec_sel( res, c, wsw );
+    return Quat( res );
+}
+
+__forceinline const Quat Quat::rotationY( float radians )
+{
+    return rotationY( floatInVec(radians) );
+}
+
+__forceinline const Quat Quat::rotationY( const floatInVec &radians )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	__declspec(align(16)) unsigned int ysw[4] = {0, 0xffffffff, 0, 0};
+	__declspec(align(16)) unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( _mm_setzero_ps(), s, ysw );
+    res = vec_sel( res, c, wsw );
+    return Quat( res );
+}
+
+__forceinline const Quat Quat::rotationZ( float radians )
+{
+    return rotationZ( floatInVec(radians) );
+}
+
+__forceinline const Quat Quat::rotationZ( const floatInVec &radians )
+{
+    __m128 s, c, angle, res;
+    angle = vec_mul( radians.get128(), _mm_set1_ps(0.5f) );
+    sincosf4( angle, &s, &c );
+	__declspec(align(16)) unsigned int zsw[4] = {0, 0, 0xffffffff, 0};
+	__declspec(align(16)) unsigned int wsw[4] = {0, 0, 0, 0xffffffff};
+    res = vec_sel( _mm_setzero_ps(), s, zsw );
+    res = vec_sel( res, c, wsw );
+    return Quat( res );
+}
+
+__forceinline const Quat Quat::operator *( const Quat &quat ) const
+{
+    __m128 ldata, rdata, qv, tmp0, tmp1, tmp2, tmp3;
+    __m128 product, l_wxyz, r_wxyz, xy, qw;
+    ldata = mVec128;
+    rdata = quat.mVec128;
+    tmp0 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,0,2,1) );
+    tmp1 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,1,0,2) );
+    tmp2 = _mm_shuffle_ps( ldata, ldata, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( rdata, rdata, _MM_SHUFFLE(3,0,2,1) );
+    qv = vec_mul( vec_splat( ldata, 3 ), rdata );
+    qv = vec_madd( vec_splat( rdata, 3 ), ldata, qv );
+    qv = vec_madd( tmp0, tmp1, qv );
+    qv = vec_nmsub( tmp2, tmp3, qv );
+    product = vec_mul( ldata, rdata );
+    l_wxyz = vec_sld( ldata, ldata, 12 );
+    r_wxyz = vec_sld( rdata, rdata, 12 );
+    qw = vec_nmsub( l_wxyz, r_wxyz, product );
+    xy = vec_madd( l_wxyz, r_wxyz, product );
+    qw = vec_sub( qw, vec_sld( xy, xy, 8 ) );
+	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+    return Quat( vec_sel( qv, qw, sw ) );
+}
+
+__forceinline Quat & Quat::operator *=( const Quat &quat )
+{
+    *this = *this * quat;
+    return *this;
+}
+
+__forceinline const Vector3 rotate( const Quat &quat, const Vector3 &vec )
+{    __m128 qdata, vdata, product, tmp0, tmp1, tmp2, tmp3, wwww, qv, qw, res;
+    qdata = quat.get128();
+    vdata = vec.get128();
+    tmp0 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,0,2,1) );
+    tmp1 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,1,0,2) );
+    tmp2 = _mm_shuffle_ps( qdata, qdata, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( vdata, vdata, _MM_SHUFFLE(3,0,2,1) );
+    wwww = vec_splat( qdata, 3 );
+    qv = vec_mul( wwww, vdata );
+    qv = vec_madd( tmp0, tmp1, qv );
+    qv = vec_nmsub( tmp2, tmp3, qv );
+    product = vec_mul( qdata, vdata );
+    qw = vec_madd( vec_sld( qdata, qdata, 4 ), vec_sld( vdata, vdata, 4 ), product );
+    qw = vec_add( vec_sld( product, product, 8 ), qw );
+    tmp1 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( qv, qv, _MM_SHUFFLE(3,0,2,1) );
+    res = vec_mul( vec_splat( qw, 0 ), qdata );
+    res = vec_madd( wwww, qv, res );
+    res = vec_madd( tmp0, tmp1, res );
+    res = vec_nmsub( tmp2, tmp3, res );
+    return Vector3( res );
+}
+
+__forceinline const Quat conj( const Quat &quat )
+{
+	__declspec(align(16)) unsigned int sw[4] = {0x80000000,0x80000000,0x80000000,0};
+    return Quat( vec_xor( quat.get128(), _mm_load_ps((float *)sw) ) );
+}
+
+__forceinline const Quat select( const Quat &quat0, const Quat &quat1, bool select1 )
+{
+    return select( quat0, quat1, boolInVec(select1) );
+}
+
+//__forceinline const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 )
+//{
+//    return Quat( vec_sel( quat0.get128(), quat1.get128(), select1.get128() ) );
+//}
+
+__forceinline void loadXYZW(Quat& quat, const float* fptr)
+{
+#ifdef USE_SSE2_LDDQU
+	quat = Quat(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128		);
+#else
+	SSEFloat fl;
+	fl.f[0] = fptr[0];
+	fl.f[1] = fptr[1];
+	fl.f[2] = fptr[2];
+	fl.f[3] = fptr[3];
+    quat = Quat(	fl.m128);
+#endif
+    
+
+}
+
+__forceinline void storeXYZW(const Quat& quat, float* fptr)
+{
+	fptr[0] = quat.getX();
+	fptr[1] = quat.getY();
+	fptr[2] = quat.getZ();
+	fptr[3] = quat.getW();
+//    _mm_storeu_ps((float*)quat.get128(),fptr);
+}
+
+
+
+#ifdef _VECTORMATH_DEBUG
+
+__forceinline void print( const Quat &quat )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = quat.get128();
+    printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+__forceinline void print( const Quat &quat, const char * name )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = quat.get128();
+    printf( "%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/src/BulletMultiThreaded/vectormath/sse/vec_aos.h b/src/BulletMultiThreaded/vectormath/sse/vec_aos.h
new file mode 100644
index 000000000..1a6e02eae
--- /dev/null
+++ b/src/BulletMultiThreaded/vectormath/sse/vec_aos.h
@@ -0,0 +1,1431 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _VECTORMATH_VEC_AOS_CPP_H
+#define _VECTORMATH_VEC_AOS_CPP_H
+
+//-----------------------------------------------------------------------------
+// Constants
+// for permutes words are labeled [x,y,z,w] [a,b,c,d]
+
+#define _VECTORMATH_PERM_X 0x00010203
+#define _VECTORMATH_PERM_Y 0x04050607
+#define _VECTORMATH_PERM_Z 0x08090a0b
+#define _VECTORMATH_PERM_W 0x0c0d0e0f
+#define _VECTORMATH_PERM_A 0x10111213
+#define _VECTORMATH_PERM_B 0x14151617
+#define _VECTORMATH_PERM_C 0x18191a1b
+#define _VECTORMATH_PERM_D 0x1c1d1e1f
+#define _VECTORMATH_PERM_XYZA (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A }
+#define _VECTORMATH_PERM_ZXYW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_W }
+#define _VECTORMATH_PERM_YZXW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_X, _VECTORMATH_PERM_W }
+#define _VECTORMATH_PERM_YZAB (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Y, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B }
+#define _VECTORMATH_PERM_ZABC (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_Z, _VECTORMATH_PERM_A, _VECTORMATH_PERM_B, _VECTORMATH_PERM_C }
+#define _VECTORMATH_PERM_XYAW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_Y, _VECTORMATH_PERM_A, _VECTORMATH_PERM_W }
+#define _VECTORMATH_PERM_XAZW (vec_uchar16)(vec_uint4){ _VECTORMATH_PERM_X, _VECTORMATH_PERM_A, _VECTORMATH_PERM_Z, _VECTORMATH_PERM_W }
+#define _VECTORMATH_MASK_0xF000 (vec_uint4){ 0xffffffff, 0, 0, 0 }
+#define _VECTORMATH_MASK_0x0F00 (vec_uint4){ 0, 0xffffffff, 0, 0 }
+#define _VECTORMATH_MASK_0x00F0 (vec_uint4){ 0, 0, 0xffffffff, 0 }
+#define _VECTORMATH_MASK_0x000F (vec_uint4){ 0, 0, 0, 0xffffffff }
+#define _VECTORMATH_UNIT_1000 _mm_setr_ps(1.0f,0.0f,0.0f,0.0f) // (__m128){ 1.0f, 0.0f, 0.0f, 0.0f }
+#define _VECTORMATH_UNIT_0100 _mm_setr_ps(0.0f,1.0f,0.0f,0.0f) // (__m128){ 0.0f, 1.0f, 0.0f, 0.0f }
+#define _VECTORMATH_UNIT_0010 _mm_setr_ps(0.0f,0.0f,1.0f,0.0f) // (__m128){ 0.0f, 0.0f, 1.0f, 0.0f }
+#define _VECTORMATH_UNIT_0001 _mm_setr_ps(0.0f,0.0f,0.0f,1.0f) // (__m128){ 0.0f, 0.0f, 0.0f, 1.0f }
+#define _VECTORMATH_SLERP_TOL 0.999f
+//_VECTORMATH_SLERP_TOLF
+
+//-----------------------------------------------------------------------------
+// Definitions
+
+#ifndef _VECTORMATH_INTERNAL_FUNCTIONS
+#define _VECTORMATH_INTERNAL_FUNCTIONS
+
+#define     _vmath_shufps(a, b, immx, immy, immz, immw) _mm_shuffle_ps(a, b, _MM_SHUFFLE(immw, immz, immy, immx))
+static __forceinline __m128 _vmathVfDot3( __m128 vec0, __m128 vec1 )
+{
+	__m128 result = _mm_mul_ps( vec0, vec1);
+    return _mm_add_ps( vec_splat( result, 0 ), _mm_add_ps( vec_splat( result, 1 ), vec_splat( result, 2 ) ) );
+}
+
+static __forceinline __m128 _vmathVfDot4( __m128 vec0, __m128 vec1 )
+{
+    __m128 result = _mm_mul_ps(vec0, vec1);
+	return _mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(0,0,0,0)),
+			_mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(1,1,1,1)),
+			_mm_add_ps(_mm_shuffle_ps(result, result, _MM_SHUFFLE(2,2,2,2)), _mm_shuffle_ps(result, result, _MM_SHUFFLE(3,3,3,3)))));
+}
+
+static __forceinline __m128 _vmathVfCross( __m128 vec0, __m128 vec1 )
+{
+    __m128 tmp0, tmp1, tmp2, tmp3, result;
+    tmp0 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,0,2,1) );
+    tmp1 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,1,0,2) );
+    tmp2 = _mm_shuffle_ps( vec0, vec0, _MM_SHUFFLE(3,1,0,2) );
+    tmp3 = _mm_shuffle_ps( vec1, vec1, _MM_SHUFFLE(3,0,2,1) );
+    result = vec_mul( tmp0, tmp1 );
+    result = vec_nmsub( tmp2, tmp3, result );
+    return result;
+}
+/*
+static __forceinline vec_uint4 _vmathVfToHalfFloatsUnpacked(__m128 v)
+{
+#if 0
+    vec_int4 bexp;
+    vec_uint4 mant, sign, hfloat;
+    vec_uint4 notZero, isInf;
+    const vec_uint4 hfloatInf = (vec_uint4)(0x00007c00u);
+    const vec_uint4 mergeMant = (vec_uint4)(0x000003ffu);
+    const vec_uint4 mergeSign = (vec_uint4)(0x00008000u);
+
+    sign = vec_sr((vec_uint4)v, (vec_uint4)16);
+    mant = vec_sr((vec_uint4)v, (vec_uint4)13);
+    bexp = vec_and(vec_sr((vec_int4)v, (vec_uint4)23), (vec_int4)0xff);
+
+    notZero = (vec_uint4)vec_cmpgt(bexp, (vec_int4)112);
+    isInf = (vec_uint4)vec_cmpgt(bexp, (vec_int4)142);
+
+    bexp = _mm_add_ps(bexp, (vec_int4)-112);
+    bexp = vec_sl(bexp, (vec_uint4)10);
+
+    hfloat = vec_sel((vec_uint4)bexp, mant, mergeMant);
+    hfloat = vec_sel((vec_uint4)(0), hfloat, notZero);
+    hfloat = vec_sel(hfloat, hfloatInf, isInf);
+    hfloat = vec_sel(hfloat, sign, mergeSign);
+
+    return hfloat;
+#else
+	assert(0);
+	return _mm_setzero_ps();
+#endif
+}
+
+static __forceinline vec_ushort8 _vmath2VfToHalfFloats(__m128 u, __m128 v)
+{
+#if 0
+    vec_uint4 hfloat_u, hfloat_v;
+    const vec_uchar16 pack = (vec_uchar16){2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+    hfloat_u = _vmathVfToHalfFloatsUnpacked(u);
+    hfloat_v = _vmathVfToHalfFloatsUnpacked(v);
+    return (vec_ushort8)vec_perm(hfloat_u, hfloat_v, pack);
+#else
+	assert(0);
+	return _mm_setzero_si128();
+#endif
+}
+*/
+
+static __forceinline __m128 _vmathVfInsert(__m128 dst, __m128 src, int slot)
+{
+	SSEFloat s;
+	s.m128 = src;
+	SSEFloat d;
+	d.m128 = dst;
+	d.f[slot] = s.f[slot];
+	return d.m128;
+}
+
+#define _vmathVfSetElement(vec, scalar, slot) ((float *)&(vec))[slot] = scalar
+
+static __forceinline __m128 _vmathVfSplatScalar(float scalar)
+{
+	return _mm_set1_ps(scalar);
+}
+
+#endif
+
+namespace Vectormath {
+namespace Aos {
+
+	
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+__forceinline VecIdx::operator floatInVec() const
+{
+    return floatInVec(ref, i);
+}
+
+__forceinline float VecIdx::getAsFloat() const
+#else
+__forceinline VecIdx::operator float() const
+#endif
+{
+    return ((float *)&ref)[i];
+}
+
+__forceinline float VecIdx::operator =( float scalar )
+{
+    _vmathVfSetElement(ref, scalar, i);
+    return scalar;
+}
+
+__forceinline floatInVec VecIdx::operator =( const floatInVec &scalar )
+{
+    ref = _vmathVfInsert(ref, scalar.get128(), i);
+    return scalar;
+}
+
+__forceinline floatInVec VecIdx::operator =( const VecIdx& scalar )
+{
+    return *this = floatInVec(scalar.ref, scalar.i);
+}
+
+__forceinline floatInVec VecIdx::operator *=( float scalar )
+{
+    return *this *= floatInVec(scalar);
+}
+
+__forceinline floatInVec VecIdx::operator *=( const floatInVec &scalar )
+{
+    return *this = floatInVec(ref, i) * scalar;
+}
+
+__forceinline floatInVec VecIdx::operator /=( float scalar )
+{
+    return *this /= floatInVec(scalar);
+}
+
+inline floatInVec VecIdx::operator /=( const floatInVec &scalar )
+{
+    return *this = floatInVec(ref, i) / scalar;
+}
+
+__forceinline floatInVec VecIdx::operator +=( float scalar )
+{
+    return *this += floatInVec(scalar);
+}
+
+__forceinline floatInVec VecIdx::operator +=( const floatInVec &scalar )
+{
+    return *this = floatInVec(ref, i) + scalar;
+}
+
+__forceinline floatInVec VecIdx::operator -=( float scalar )
+{
+    return *this -= floatInVec(scalar);
+}
+
+__forceinline floatInVec VecIdx::operator -=( const floatInVec &scalar )
+{
+    return *this = floatInVec(ref, i) - scalar;
+}
+
+__forceinline Vector3::Vector3(const Vector3& vec)
+{
+    set128(vec.get128());
+}
+
+__forceinline void Vector3::set128(vec_float4 vec)
+{
+    mVec128 = vec;
+}
+
+
+__forceinline Vector3::Vector3( float _x, float _y, float _z )
+{
+    mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
+}
+
+__forceinline Vector3::Vector3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
+{
+	__m128 xz = _mm_unpacklo_ps( _x.get128(), _z.get128() );
+	mVec128 = _mm_unpacklo_ps( xz, _y.get128() );
+}
+
+__forceinline Vector3::Vector3( const Point3 &pnt )
+{
+    mVec128 = pnt.get128();
+}
+
+__forceinline Vector3::Vector3( float scalar )
+{
+    mVec128 = floatInVec(scalar).get128();
+}
+
+__forceinline Vector3::Vector3( const floatInVec &scalar )
+{
+    mVec128 = scalar.get128();
+}
+
+__forceinline Vector3::Vector3( __m128 vf4 )
+{
+    mVec128 = vf4;
+}
+
+__forceinline const Vector3 Vector3::xAxis( )
+{
+    return Vector3( _VECTORMATH_UNIT_1000 );
+}
+
+__forceinline const Vector3 Vector3::yAxis( )
+{
+    return Vector3( _VECTORMATH_UNIT_0100 );
+}
+
+__forceinline const Vector3 Vector3::zAxis( )
+{
+    return Vector3( _VECTORMATH_UNIT_0010 );
+}
+
+__forceinline const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return lerp( floatInVec(t), vec0, vec1 );
+}
+
+__forceinline const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
+}
+
+__forceinline const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
+{
+    return slerp( floatInVec(t), unitVec0, unitVec1 );
+}
+
+__forceinline const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 )
+{
+    __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
+    cosAngle = _vmathVfDot3( unitVec0.get128(), unitVec1.get128() );
+    __m128 selectMask = _mm_cmpgt_ps( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
+    angle = acosf4( cosAngle );
+    tttt = t.get128();
+    oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt );
+    angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt ); // angles = 1, t, 1, t
+    angles = _mm_unpacklo_ps( angles, oneMinusT );		// angles = 1, 1-t, t, 1-t
+    angles = _mm_mul_ps( angles, angle );
+    sines = sinf4( angles );
+    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
+    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
+    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
+    return Vector3( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) );
+}
+
+__forceinline __m128 Vector3::get128( ) const
+{
+    return mVec128;
+}
+
+__forceinline void loadXYZ(Vector3& vec, const float* fptr)
+{
+#ifdef USE_SSE2_LDDQU
+	vec = Vector3(	SSEFloat(_mm_lddqu_si128((const __m128i*)((float*)(fptr)))).m128 );
+#else
+	SSEFloat fl;
+	fl.f[0] = fptr[0];
+	fl.f[1] = fptr[1];
+	fl.f[2] = fptr[2];
+	fl.f[3] = fptr[3];
+    vec = Vector3(	fl.m128);
+#endif //USE_SSE2_LDDQU
+	
+}
+
+__forceinline void storeXYZ( const Vector3 &vec, __m128 * quad )
+{
+    __m128 dstVec = *quad;
+	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize
+    dstVec = vec_sel(vec.get128(), dstVec, sw);
+    *quad = dstVec;
+}
+
+__forceinline void storeXYZ(const Vector3& vec, float* fptr)
+{
+	fptr[0] = vec.getX();
+	fptr[1] = vec.getY();
+	fptr[2] = vec.getZ();
+}
+
+
+__forceinline void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads )
+{
+	const float *quads = (float *)threeQuads;
+    vec0 = Vector3(  _mm_load_ps(quads) );
+    vec1 = Vector3( _mm_loadu_ps(quads + 3) );
+    vec2 = Vector3( _mm_loadu_ps(quads + 6) );
+    vec3 = Vector3( _mm_loadu_ps(quads + 9) );
+}
+
+__forceinline void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads )
+{
+	__m128 xxxx = _mm_shuffle_ps( vec1.get128(), vec1.get128(), _MM_SHUFFLE(0, 0, 0, 0) );
+	__m128 zzzz = _mm_shuffle_ps( vec2.get128(), vec2.get128(), _MM_SHUFFLE(2, 2, 2, 2) );
+	__declspec(align(16)) unsigned int xsw[4] = {0, 0, 0, 0xffffffff};
+	__declspec(align(16)) unsigned int zsw[4] = {0xffffffff, 0, 0, 0};
+	threeQuads[0] = vec_sel( vec0.get128(), xxxx, xsw );
+    threeQuads[1] = _mm_shuffle_ps( vec1.get128(), vec2.get128(), _MM_SHUFFLE(1, 0, 2, 1) );
+    threeQuads[2] = vec_sel( _mm_shuffle_ps( vec3.get128(), vec3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
+}
+/*
+__forceinline void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads )
+{
+	assert(0);
+#if 0
+    __m128 xyz0[3];
+    __m128 xyz1[3];
+    storeXYZArray( vec0, vec1, vec2, vec3, xyz0 );
+    storeXYZArray( vec4, vec5, vec6, vec7, xyz1 );
+    threeQuads[0] = _vmath2VfToHalfFloats(xyz0[0], xyz0[1]);
+    threeQuads[1] = _vmath2VfToHalfFloats(xyz0[2], xyz1[0]);
+    threeQuads[2] = _vmath2VfToHalfFloats(xyz1[1], xyz1[2]);
+#endif
+}
+*/
+__forceinline Vector3 & Vector3::operator =( const Vector3 &vec )
+{
+    mVec128 = vec.mVec128;
+    return *this;
+}
+
+__forceinline Vector3 & Vector3::setX( float _x )
+{
+    _vmathVfSetElement(mVec128, _x, 0);
+    return *this;
+}
+
+__forceinline Vector3 & Vector3::setX( const floatInVec &_x )
+{
+    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
+    return *this;
+}
+
+__forceinline const floatInVec Vector3::getX( ) const
+{
+    return floatInVec( mVec128, 0 );
+}
+
+__forceinline Vector3 & Vector3::setY( float _y )
+{
+    _vmathVfSetElement(mVec128, _y, 1);
+    return *this;
+}
+
+__forceinline Vector3 & Vector3::setY( const floatInVec &_y )
+{
+    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
+    return *this;
+}
+
+__forceinline const floatInVec Vector3::getY( ) const
+{
+    return floatInVec( mVec128, 1 );
+}
+
+__forceinline Vector3 & Vector3::setZ( float _z )
+{
+    _vmathVfSetElement(mVec128, _z, 2);
+    return *this;
+}
+
+__forceinline Vector3 & Vector3::setZ( const floatInVec &_z )
+{
+    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
+    return *this;
+}
+
+__forceinline const floatInVec Vector3::getZ( ) const
+{
+    return floatInVec( mVec128, 2 );
+}
+
+__forceinline Vector3 & Vector3::setElem( int idx, float value )
+{
+    _vmathVfSetElement(mVec128, value, idx);
+    return *this;
+}
+
+__forceinline Vector3 & Vector3::setElem( int idx, const floatInVec &value )
+{
+    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
+    return *this;
+}
+
+__forceinline const floatInVec Vector3::getElem( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+__forceinline VecIdx Vector3::operator []( int idx )
+{
+    return VecIdx( mVec128, idx );
+}
+
+__forceinline const floatInVec Vector3::operator []( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+__forceinline const Vector3 Vector3::operator +( const Vector3 &vec ) const
+{
+    return Vector3( _mm_add_ps( mVec128, vec.mVec128 ) );
+}
+
+__forceinline const Vector3 Vector3::operator -( const Vector3 &vec ) const
+{
+    return Vector3( _mm_sub_ps( mVec128, vec.mVec128 ) );
+}
+
+__forceinline const Point3 Vector3::operator +( const Point3 &pnt ) const
+{
+    return Point3( _mm_add_ps( mVec128, pnt.get128() ) );
+}
+
+__forceinline const Vector3 Vector3::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+__forceinline const Vector3 Vector3::operator *( const floatInVec &scalar ) const
+{
+    return Vector3( _mm_mul_ps( mVec128, scalar.get128() ) );
+}
+
+__forceinline Vector3 & Vector3::operator +=( const Vector3 &vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+__forceinline Vector3 & Vector3::operator -=( const Vector3 &vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+__forceinline Vector3 & Vector3::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+__forceinline Vector3 & Vector3::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+__forceinline const Vector3 Vector3::operator /( float scalar ) const
+{
+    return *this / floatInVec(scalar);
+}
+
+__forceinline const Vector3 Vector3::operator /( const floatInVec &scalar ) const
+{
+    return Vector3( _mm_div_ps( mVec128, scalar.get128() ) );
+}
+
+__forceinline Vector3 & Vector3::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+__forceinline Vector3 & Vector3::operator /=( const floatInVec &scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+__forceinline const Vector3 Vector3::operator -( ) const
+{
+	//return Vector3(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
+
+	__declspec(align(16)) static const int array[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+	__m128 NEG_MASK = SSEFloat(*(const vec_float4*)array).vf;
+	return Vector3(_mm_xor_ps(get128(),NEG_MASK));
+}
+
+__forceinline const Vector3 operator *( float scalar, const Vector3 &vec )
+{
+    return floatInVec(scalar) * vec;
+}
+
+__forceinline const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec )
+{
+    return vec * scalar;
+}
+
+__forceinline const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
+}
+
+__forceinline const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _mm_div_ps( vec0.get128(), vec1.get128() ) );
+}
+
+__forceinline const Vector3 recipPerElem( const Vector3 &vec )
+{
+    return Vector3( _mm_rcp_ps( vec.get128() ) );
+}
+
+__forceinline const Vector3 absPerElem( const Vector3 &vec )
+{
+    return Vector3( fabsf4( vec.get128() ) );
+}
+
+__forceinline const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+	__m128 vmask = toM128(0x7fffffff);
+	return Vector3( _mm_or_ps(
+		_mm_and_ps   ( vmask, vec0.get128() ),			// Value
+		_mm_andnot_ps( vmask, vec1.get128() ) ) );		// Signs
+}
+
+__forceinline const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _mm_max_ps( vec0.get128(), vec1.get128() ) );
+}
+
+__forceinline const floatInVec maxElem( const Vector3 &vec )
+{
+    return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
+}
+
+__forceinline const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _mm_min_ps( vec0.get128(), vec1.get128() ) );
+}
+
+__forceinline const floatInVec minElem( const Vector3 &vec )
+{
+    return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
+}
+
+__forceinline const floatInVec sum( const Vector3 &vec )
+{
+    return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ), vec_splat( vec.get128(), 2 ) ) );
+}
+
+__forceinline const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return floatInVec( _vmathVfDot3( vec0.get128(), vec1.get128() ), 0 );
+}
+
+__forceinline const floatInVec lengthSqr( const Vector3 &vec )
+{
+    return floatInVec(  _vmathVfDot3( vec.get128(), vec.get128() ), 0 );
+}
+
+__forceinline const floatInVec length( const Vector3 &vec )
+{
+    return floatInVec(  _mm_sqrt_ps(_vmathVfDot3( vec.get128(), vec.get128() )), 0 );
+}
+
+
+__forceinline const Vector3 normalizeApprox( const Vector3 &vec )
+{
+    return Vector3( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
+}
+
+__forceinline const Vector3 normalize( const Vector3 &vec )
+{
+	return Vector3( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot3( vec.get128(), vec.get128() ) ) ) );
+}
+
+__forceinline const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 )
+{
+    return Vector3( _vmathVfCross( vec0.get128(), vec1.get128() ) );
+}
+
+__forceinline const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 )
+{
+    return select( vec0, vec1, boolInVec(select1) );
+}
+
+
+__forceinline  const Vector4 select(const Vector4& vec0, const Vector4& vec1, const boolInVec& select1)
+{
+    return Vector4(vec_sel(vec0.get128(), vec1.get128(), select1.get128()));
+}
+
+#ifdef _VECTORMATH_DEBUG
+
+__forceinline void print( const Vector3 &vec )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = vec.get128();
+    printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
+}
+
+__forceinline void print( const Vector3 &vec, const char * name )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = vec.get128();
+    printf( "%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] );
+}
+
+#endif
+
+__forceinline Vector4::Vector4( float _x, float _y, float _z, float _w )
+{
+    mVec128 = _mm_setr_ps(_x, _y, _z, _w); 
+ }
+
+__forceinline Vector4::Vector4( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z, const floatInVec &_w )
+{
+	mVec128 = _mm_unpacklo_ps(
+		_mm_unpacklo_ps( _x.get128(), _z.get128() ),
+		_mm_unpacklo_ps( _y.get128(), _w.get128() ) );
+}
+
+__forceinline Vector4::Vector4( const Vector3 &xyz, float _w )
+{
+    mVec128 = xyz.get128();
+    _vmathVfSetElement(mVec128, _w, 3);
+}
+
+__forceinline Vector4::Vector4( const Vector3 &xyz, const floatInVec &_w )
+{
+    mVec128 = xyz.get128();
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+}
+
+__forceinline Vector4::Vector4( const Vector3 &vec )
+{
+    mVec128 = vec.get128();
+    mVec128 = _vmathVfInsert(mVec128, _mm_setzero_ps(), 3);
+}
+
+__forceinline Vector4::Vector4( const Point3 &pnt )
+{
+    mVec128 = pnt.get128();
+    mVec128 = _vmathVfInsert(mVec128, _mm_set1_ps(1.0f), 3);
+}
+
+__forceinline Vector4::Vector4( const Quat &quat )
+{
+    mVec128 = quat.get128();
+}
+
+__forceinline Vector4::Vector4( float scalar )
+{
+    mVec128 = floatInVec(scalar).get128();
+}
+
+__forceinline Vector4::Vector4( const floatInVec &scalar )
+{
+    mVec128 = scalar.get128();
+}
+
+__forceinline Vector4::Vector4( __m128 vf4 )
+{
+    mVec128 = vf4;
+}
+
+__forceinline const Vector4 Vector4::xAxis( )
+{
+    return Vector4( _VECTORMATH_UNIT_1000 );
+}
+
+__forceinline const Vector4 Vector4::yAxis( )
+{
+    return Vector4( _VECTORMATH_UNIT_0100 );
+}
+
+__forceinline const Vector4 Vector4::zAxis( )
+{
+    return Vector4( _VECTORMATH_UNIT_0010 );
+}
+
+__forceinline const Vector4 Vector4::wAxis( )
+{
+    return Vector4( _VECTORMATH_UNIT_0001 );
+}
+
+__forceinline const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return lerp( floatInVec(t), vec0, vec1 );
+}
+
+__forceinline const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return ( vec0 + ( ( vec1 - vec0 ) * t ) );
+}
+
+__forceinline const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
+{
+    return slerp( floatInVec(t), unitVec0, unitVec1 );
+}
+
+__forceinline const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 )
+{
+    __m128 scales, scale0, scale1, cosAngle, angle, tttt, oneMinusT, angles, sines;
+    cosAngle = _vmathVfDot4( unitVec0.get128(), unitVec1.get128() );
+    __m128 selectMask = _mm_cmpgt_ps( _mm_set1_ps(_VECTORMATH_SLERP_TOL), cosAngle );
+    angle = acosf4( cosAngle );
+    tttt = t.get128();
+    oneMinusT = _mm_sub_ps( _mm_set1_ps(1.0f), tttt );
+    angles = _mm_unpacklo_ps( _mm_set1_ps(1.0f), tttt ); // angles = 1, t, 1, t
+    angles = _mm_unpacklo_ps( angles, oneMinusT );		// angles = 1, 1-t, t, 1-t
+    angles = _mm_mul_ps( angles, angle );
+    sines = sinf4( angles );
+    scales = _mm_div_ps( sines, vec_splat( sines, 0 ) );
+    scale0 = vec_sel( oneMinusT, vec_splat( scales, 1 ), selectMask );
+    scale1 = vec_sel( tttt, vec_splat( scales, 2 ), selectMask );
+    return Vector4( vec_madd( unitVec0.get128(), scale0, _mm_mul_ps( unitVec1.get128(), scale1 ) ) );
+}
+
+__forceinline __m128 Vector4::get128( ) const
+{
+    return mVec128;
+}
+/*
+__forceinline void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads )
+{
+    twoQuads[0] = _vmath2VfToHalfFloats(vec0.get128(), vec1.get128());
+    twoQuads[1] = _vmath2VfToHalfFloats(vec2.get128(), vec3.get128());
+}
+*/
+__forceinline Vector4 & Vector4::operator =( const Vector4 &vec )
+{
+    mVec128 = vec.mVec128;
+    return *this;
+}
+
+__forceinline Vector4 & Vector4::setXYZ( const Vector3 &vec )
+{
+	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff};
+	mVec128 = vec_sel( vec.get128(), mVec128, sw );
+    return *this;
+}
+
+__forceinline const Vector3 Vector4::getXYZ( ) const
+{
+    return Vector3( mVec128 );
+}
+
+__forceinline Vector4 & Vector4::setX( float _x )
+{
+    _vmathVfSetElement(mVec128, _x, 0);
+    return *this;
+}
+
+__forceinline Vector4 & Vector4::setX( const floatInVec &_x )
+{
+    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
+    return *this;
+}
+
+__forceinline const floatInVec Vector4::getX( ) const
+{
+    return floatInVec( mVec128, 0 );
+}
+
+__forceinline Vector4 & Vector4::setY( float _y )
+{
+    _vmathVfSetElement(mVec128, _y, 1);
+    return *this;
+}
+
+__forceinline Vector4 & Vector4::setY( const floatInVec &_y )
+{
+    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
+    return *this;
+}
+
+__forceinline const floatInVec Vector4::getY( ) const
+{
+    return floatInVec( mVec128, 1 );
+}
+
+__forceinline Vector4 & Vector4::setZ( float _z )
+{
+    _vmathVfSetElement(mVec128, _z, 2);
+    return *this;
+}
+
+__forceinline Vector4 & Vector4::setZ( const floatInVec &_z )
+{
+    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
+    return *this;
+}
+
+__forceinline const floatInVec Vector4::getZ( ) const
+{
+    return floatInVec( mVec128, 2 );
+}
+
+__forceinline Vector4 & Vector4::setW( float _w )
+{
+    _vmathVfSetElement(mVec128, _w, 3);
+    return *this;
+}
+
+__forceinline Vector4 & Vector4::setW( const floatInVec &_w )
+{
+    mVec128 = _vmathVfInsert(mVec128, _w.get128(), 3);
+    return *this;
+}
+
+__forceinline const floatInVec Vector4::getW( ) const
+{
+    return floatInVec( mVec128, 3 );
+}
+
+__forceinline Vector4 & Vector4::setElem( int idx, float value )
+{
+    _vmathVfSetElement(mVec128, value, idx);
+    return *this;
+}
+
+__forceinline Vector4 & Vector4::setElem( int idx, const floatInVec &value )
+{
+    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
+    return *this;
+}
+
+__forceinline const floatInVec Vector4::getElem( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+__forceinline VecIdx Vector4::operator []( int idx )
+{
+    return VecIdx( mVec128, idx );
+}
+
+__forceinline const floatInVec Vector4::operator []( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+__forceinline const Vector4 Vector4::operator +( const Vector4 &vec ) const
+{
+    return Vector4( _mm_add_ps( mVec128, vec.mVec128 ) );
+}
+
+__forceinline const Vector4 Vector4::operator -( const Vector4 &vec ) const
+{
+    return Vector4( _mm_sub_ps( mVec128, vec.mVec128 ) );
+}
+
+__forceinline const Vector4 Vector4::operator *( float scalar ) const
+{
+    return *this * floatInVec(scalar);
+}
+
+__forceinline const Vector4 Vector4::operator *( const floatInVec &scalar ) const
+{
+    return Vector4( _mm_mul_ps( mVec128, scalar.get128() ) );
+}
+
+__forceinline Vector4 & Vector4::operator +=( const Vector4 &vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+__forceinline Vector4 & Vector4::operator -=( const Vector4 &vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+__forceinline Vector4 & Vector4::operator *=( float scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+__forceinline Vector4 & Vector4::operator *=( const floatInVec &scalar )
+{
+    *this = *this * scalar;
+    return *this;
+}
+
+__forceinline const Vector4 Vector4::operator /( float scalar ) const
+{
+    return *this / floatInVec(scalar);
+}
+
+__forceinline const Vector4 Vector4::operator /( const floatInVec &scalar ) const
+{
+    return Vector4( _mm_div_ps( mVec128, scalar.get128() ) );
+}
+
+__forceinline Vector4 & Vector4::operator /=( float scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+__forceinline Vector4 & Vector4::operator /=( const floatInVec &scalar )
+{
+    *this = *this / scalar;
+    return *this;
+}
+
+__forceinline const Vector4 Vector4::operator -( ) const
+{
+	return Vector4(_mm_sub_ps( _mm_setzero_ps(), mVec128 ) );
+}
+
+__forceinline const Vector4 operator *( float scalar, const Vector4 &vec )
+{
+    return floatInVec(scalar) * vec;
+}
+
+__forceinline const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec )
+{
+    return vec * scalar;
+}
+
+__forceinline const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return Vector4( _mm_mul_ps( vec0.get128(), vec1.get128() ) );
+}
+
+__forceinline const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return Vector4( _mm_div_ps( vec0.get128(), vec1.get128() ) );
+}
+
+__forceinline const Vector4 recipPerElem( const Vector4 &vec )
+{
+    return Vector4( _mm_rcp_ps( vec.get128() ) );
+}
+
+__forceinline const Vector4 absPerElem( const Vector4 &vec )
+{
+    return Vector4( fabsf4( vec.get128() ) );
+}
+
+__forceinline const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+	__m128 vmask = toM128(0x7fffffff);
+	return Vector4( _mm_or_ps(
+		_mm_and_ps   ( vmask, vec0.get128() ),			// Value
+		_mm_andnot_ps( vmask, vec1.get128() ) ) );		// Signs
+}
+
+__forceinline const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return Vector4( _mm_max_ps( vec0.get128(), vec1.get128() ) );
+}
+
+__forceinline const floatInVec maxElem( const Vector4 &vec )
+{
+    return floatInVec( _mm_max_ps(
+		_mm_max_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
+		_mm_max_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
+}
+
+__forceinline const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return Vector4( _mm_min_ps( vec0.get128(), vec1.get128() ) );
+}
+
+__forceinline const floatInVec minElem( const Vector4 &vec )
+{
+    return floatInVec( _mm_min_ps(
+		_mm_min_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
+		_mm_min_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
+}
+
+__forceinline const floatInVec sum( const Vector4 &vec )
+{
+    return floatInVec( _mm_add_ps(
+		_mm_add_ps( vec_splat( vec.get128(), 0 ), vec_splat( vec.get128(), 1 ) ),
+		_mm_add_ps( vec_splat( vec.get128(), 2 ), vec_splat( vec.get128(), 3 ) ) ) );
+}
+
+__forceinline const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 )
+{
+    return floatInVec( _vmathVfDot4( vec0.get128(), vec1.get128() ), 0 );
+}
+
+__forceinline const floatInVec lengthSqr( const Vector4 &vec )
+{
+    return floatInVec(  _vmathVfDot4( vec.get128(), vec.get128() ), 0 );
+}
+
+__forceinline const floatInVec length( const Vector4 &vec )
+{
+    return floatInVec(  _mm_sqrt_ps(_vmathVfDot4( vec.get128(), vec.get128() )), 0 );
+}
+
+__forceinline const Vector4 normalizeApprox( const Vector4 &vec )
+{
+    return Vector4( _mm_mul_ps( vec.get128(), _mm_rsqrt_ps( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
+}
+
+__forceinline const Vector4 normalize( const Vector4 &vec )
+{
+    return Vector4( _mm_mul_ps( vec.get128(), newtonrapson_rsqrt4( _vmathVfDot4( vec.get128(), vec.get128() ) ) ) );
+}
+
+__forceinline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 )
+{
+    return select( vec0, vec1, boolInVec(select1) );
+}
+
+
+#ifdef _VECTORMATH_DEBUG
+
+__forceinline void print( const Vector4 &vec )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = vec.get128();
+    printf( "( %f %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+__forceinline void print( const Vector4 &vec, const char * name )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = vec.get128();
+    printf( "%s: ( %f %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2], tmp.s[3] );
+}
+
+#endif
+
+__forceinline Point3::Point3( float _x, float _y, float _z )
+{
+    mVec128 = _mm_setr_ps(_x, _y, _z, 0.0f);
+}
+
+__forceinline Point3::Point3( const floatInVec &_x, const floatInVec &_y, const floatInVec &_z )
+{
+	mVec128 = _mm_unpacklo_ps( _mm_unpacklo_ps( _x.get128(), _z.get128() ), _y.get128() );
+}
+
+__forceinline Point3::Point3( const Vector3 &vec )
+{
+    mVec128 = vec.get128();
+}
+
+__forceinline Point3::Point3( float scalar )
+{
+    mVec128 = floatInVec(scalar).get128();
+}
+
+__forceinline Point3::Point3( const floatInVec &scalar )
+{
+    mVec128 = scalar.get128();
+}
+
+__forceinline Point3::Point3( __m128 vf4 )
+{
+    mVec128 = vf4;
+}
+
+__forceinline const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return lerp( floatInVec(t), pnt0, pnt1 );
+}
+
+__forceinline const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return ( pnt0 + ( ( pnt1 - pnt0 ) * t ) );
+}
+
+__forceinline __m128 Point3::get128( ) const
+{
+    return mVec128;
+}
+
+__forceinline void storeXYZ( const Point3 &pnt, __m128 * quad )
+{
+    __m128 dstVec = *quad;
+	__declspec(align(16)) unsigned int sw[4] = {0, 0, 0, 0xffffffff}; // TODO: Centralize
+    dstVec = vec_sel(pnt.get128(), dstVec, sw);
+    *quad = dstVec;
+}
+
+__forceinline void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads )
+{
+	const float *quads = (float *)threeQuads;
+    pnt0 = Point3(  _mm_load_ps(quads) );
+    pnt1 = Point3( _mm_loadu_ps(quads + 3) );
+    pnt2 = Point3( _mm_loadu_ps(quads + 6) );
+    pnt3 = Point3( _mm_loadu_ps(quads + 9) );
+}
+
+__forceinline void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads )
+{
+	__m128 xxxx = _mm_shuffle_ps( pnt1.get128(), pnt1.get128(), _MM_SHUFFLE(0, 0, 0, 0) );
+	__m128 zzzz = _mm_shuffle_ps( pnt2.get128(), pnt2.get128(), _MM_SHUFFLE(2, 2, 2, 2) );
+	__declspec(align(16)) unsigned int xsw[4] = {0, 0, 0, 0xffffffff};
+	__declspec(align(16)) unsigned int zsw[4] = {0xffffffff, 0, 0, 0};
+	threeQuads[0] = vec_sel( pnt0.get128(), xxxx, xsw );
+    threeQuads[1] = _mm_shuffle_ps( pnt1.get128(), pnt2.get128(), _MM_SHUFFLE(1, 0, 2, 1) );
+    threeQuads[2] = vec_sel( _mm_shuffle_ps( pnt3.get128(), pnt3.get128(), _MM_SHUFFLE(2, 1, 0, 3) ), zzzz, zsw );
+}
+/*
+__forceinline void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads )
+{
+#if 0
+    __m128 xyz0[3];
+    __m128 xyz1[3];
+    storeXYZArray( pnt0, pnt1, pnt2, pnt3, xyz0 );
+    storeXYZArray( pnt4, pnt5, pnt6, pnt7, xyz1 );
+    threeQuads[0] = _vmath2VfToHalfFloats(xyz0[0], xyz0[1]);
+    threeQuads[1] = _vmath2VfToHalfFloats(xyz0[2], xyz1[0]);
+    threeQuads[2] = _vmath2VfToHalfFloats(xyz1[1], xyz1[2]);
+#else
+	assert(0);
+#endif
+}
+*/
+__forceinline Point3 & Point3::operator =( const Point3 &pnt )
+{
+    mVec128 = pnt.mVec128;
+    return *this;
+}
+
+__forceinline Point3 & Point3::setX( float _x )
+{
+    _vmathVfSetElement(mVec128, _x, 0);
+    return *this;
+}
+
+__forceinline Point3 & Point3::setX( const floatInVec &_x )
+{
+    mVec128 = _vmathVfInsert(mVec128, _x.get128(), 0);
+    return *this;
+}
+
+__forceinline const floatInVec Point3::getX( ) const
+{
+    return floatInVec( mVec128, 0 );
+}
+
+__forceinline Point3 & Point3::setY( float _y )
+{
+    _vmathVfSetElement(mVec128, _y, 1);
+    return *this;
+}
+
+__forceinline Point3 & Point3::setY( const floatInVec &_y )
+{
+    mVec128 = _vmathVfInsert(mVec128, _y.get128(), 1);
+    return *this;
+}
+
+__forceinline const floatInVec Point3::getY( ) const
+{
+    return floatInVec( mVec128, 1 );
+}
+
+__forceinline Point3 & Point3::setZ( float _z )
+{
+    _vmathVfSetElement(mVec128, _z, 2);
+    return *this;
+}
+
+__forceinline Point3 & Point3::setZ( const floatInVec &_z )
+{
+    mVec128 = _vmathVfInsert(mVec128, _z.get128(), 2);
+    return *this;
+}
+
+__forceinline const floatInVec Point3::getZ( ) const
+{
+    return floatInVec( mVec128, 2 );
+}
+
+__forceinline Point3 & Point3::setElem( int idx, float value )
+{
+    _vmathVfSetElement(mVec128, value, idx);
+    return *this;
+}
+
+__forceinline Point3 & Point3::setElem( int idx, const floatInVec &value )
+{
+    mVec128 = _vmathVfInsert(mVec128, value.get128(), idx);
+    return *this;
+}
+
+__forceinline const floatInVec Point3::getElem( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+__forceinline VecIdx Point3::operator []( int idx )
+{
+    return VecIdx( mVec128, idx );
+}
+
+__forceinline const floatInVec Point3::operator []( int idx ) const
+{
+    return floatInVec( mVec128, idx );
+}
+
+__forceinline const Vector3 Point3::operator -( const Point3 &pnt ) const
+{
+    return Vector3( _mm_sub_ps( mVec128, pnt.mVec128 ) );
+}
+
+__forceinline const Point3 Point3::operator +( const Vector3 &vec ) const
+{
+    return Point3( _mm_add_ps( mVec128, vec.get128() ) );
+}
+
+__forceinline const Point3 Point3::operator -( const Vector3 &vec ) const
+{
+    return Point3( _mm_sub_ps( mVec128, vec.get128() ) );
+}
+
+__forceinline Point3 & Point3::operator +=( const Vector3 &vec )
+{
+    *this = *this + vec;
+    return *this;
+}
+
+__forceinline Point3 & Point3::operator -=( const Vector3 &vec )
+{
+    *this = *this - vec;
+    return *this;
+}
+
+__forceinline const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return Point3( _mm_mul_ps( pnt0.get128(), pnt1.get128() ) );
+}
+
+__forceinline const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return Point3( _mm_div_ps( pnt0.get128(), pnt1.get128() ) );
+}
+
+__forceinline const Point3 recipPerElem( const Point3 &pnt )
+{
+    return Point3( _mm_rcp_ps( pnt.get128() ) );
+}
+
+__forceinline const Point3 absPerElem( const Point3 &pnt )
+{
+    return Point3( fabsf4( pnt.get128() ) );
+}
+
+__forceinline const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+	__m128 vmask = toM128(0x7fffffff);
+	return Point3( _mm_or_ps(
+		_mm_and_ps   ( vmask, pnt0.get128() ),			// Value
+		_mm_andnot_ps( vmask, pnt1.get128() ) ) );		// Signs
+}
+
+__forceinline const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return Point3( _mm_max_ps( pnt0.get128(), pnt1.get128() ) );
+}
+
+__forceinline const floatInVec maxElem( const Point3 &pnt )
+{
+    return floatInVec( _mm_max_ps( _mm_max_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
+}
+
+__forceinline const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return Point3( _mm_min_ps( pnt0.get128(), pnt1.get128() ) );
+}
+
+__forceinline const floatInVec minElem( const Point3 &pnt )
+{
+    return floatInVec( _mm_min_ps( _mm_min_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
+}
+
+__forceinline const floatInVec sum( const Point3 &pnt )
+{
+    return floatInVec( _mm_add_ps( _mm_add_ps( vec_splat( pnt.get128(), 0 ), vec_splat( pnt.get128(), 1 ) ), vec_splat( pnt.get128(), 2 ) ) );
+}
+
+__forceinline const Point3 scale( const Point3 &pnt, float scaleVal )
+{
+    return scale( pnt, floatInVec( scaleVal ) );
+}
+
+__forceinline const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal )
+{
+    return mulPerElem( pnt, Point3( scaleVal ) );
+}
+
+__forceinline const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec )
+{
+    return mulPerElem( pnt, Point3( scaleVec ) );
+}
+
+__forceinline const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec )
+{
+    return floatInVec( _vmathVfDot3( pnt.get128(), unitVec.get128() ), 0 );
+}
+
+__forceinline const floatInVec distSqrFromOrigin( const Point3 &pnt )
+{
+    return lengthSqr( Vector3( pnt ) );
+}
+
+__forceinline const floatInVec distFromOrigin( const Point3 &pnt )
+{
+    return length( Vector3( pnt ) );
+}
+
+__forceinline const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return lengthSqr( ( pnt1 - pnt0 ) );
+}
+
+__forceinline const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 )
+{
+    return length( ( pnt1 - pnt0 ) );
+}
+
+__forceinline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 )
+{
+    return select( pnt0, pnt1, boolInVec(select1) );
+}
+
+__forceinline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 )
+{
+    return Point3( vec_sel( pnt0.get128(), pnt1.get128(), select1.get128() ) );
+}
+
+
+
+#ifdef _VECTORMATH_DEBUG
+
+__forceinline void print( const Point3 &pnt )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = pnt.get128();
+    printf( "( %f %f %f )\n", tmp.s[0], tmp.s[1], tmp.s[2] );
+}
+
+__forceinline void print( const Point3 &pnt, const char * name )
+{
+    union { __m128 v; float s[4]; } tmp;
+    tmp.v = pnt.get128();
+    printf( "%s: ( %f %f %f )\n", name, tmp.s[0], tmp.s[1], tmp.s[2] );
+}
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/src/BulletMultiThreaded/vectormath/sse/vecidx_aos.h b/src/BulletMultiThreaded/vectormath/sse/vecidx_aos.h
new file mode 100644
index 000000000..c5c6eb908
--- /dev/null
+++ b/src/BulletMultiThreaded/vectormath/sse/vecidx_aos.h
@@ -0,0 +1,80 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _VECTORMATH_VECIDX_AOS_H
+#define _VECTORMATH_VECIDX_AOS_H
+
+
+#include "floatInVec.h"
+
+namespace Vectormath {
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// VecIdx 
+// Used in setting elements of Vector3, Vector4, Point3, or Quat with the 
+// subscripting operator.
+//
+
+__declspec(align(16)) class VecIdx
+{
+private:
+   __m128 &ref;
+   int i;
+public:
+    inline VecIdx( __m128& vec, int idx ): ref(vec) { i = idx; }
+
+    // implicitly casts to float unless _VECTORMATH_NO_SCALAR_CAST defined
+    // in which case, implicitly casts to floatInVec, and one must call
+    // getAsFloat to convert to float.
+    //
+#ifdef _VECTORMATH_NO_SCALAR_CAST
+    inline operator floatInVec() const;
+    inline float getAsFloat() const;
+#else
+    inline operator float() const;
+#endif
+
+    inline float operator =( float scalar );
+    inline floatInVec operator =( const floatInVec &scalar );
+    inline floatInVec operator =( const VecIdx& scalar );
+    inline floatInVec operator *=( float scalar );
+    inline floatInVec operator *=( const floatInVec &scalar );
+    inline floatInVec operator /=( float scalar );
+    inline floatInVec operator /=( const floatInVec &scalar );
+    inline floatInVec operator +=( float scalar );
+    inline floatInVec operator +=( const floatInVec &scalar );
+    inline floatInVec operator -=( float scalar );
+    inline floatInVec operator -=( const floatInVec &scalar );
+};
+
+} // namespace Aos
+} // namespace Vectormath
+
+#endif
diff --git a/src/BulletMultiThreaded/vectormath/sse/vectormath_aos.h b/src/BulletMultiThreaded/vectormath/sse/vectormath_aos.h
new file mode 100644
index 000000000..0430ef7af
--- /dev/null
+++ b/src/BulletMultiThreaded/vectormath/sse/vectormath_aos.h
@@ -0,0 +1,2532 @@
+/*
+   Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms,
+   with or without modification, are permitted provided that the
+   following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Sony Computer Entertainment Inc nor the names
+      of its contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef _VECTORMATH_AOS_CPP_SSE_H
+#define _VECTORMATH_AOS_CPP_SSE_H
+
+#include <math.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <assert.h>
+
+#define Vector3Ref Vector3&
+#define QuatRef	Quat&
+#define Matrix3Ref Matrix3&
+#define VM_ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a
+
+#define USE_SSE2_LDDQU
+#ifdef USE_SSE2_LDDQU
+#include <intrin.h>  //used for _mm_lddqu_si128
+#endif //USE_SSE2_LDDQU
+
+// TODO: Tidy
+typedef __m128 vec_float4;
+typedef __m128 vec_uint4;
+typedef __m128 vec_int4;
+typedef __m128i vec_uchar16;
+typedef __m128i vec_ushort8;
+
+#define vec_splat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e,e,e,e))
+
+#define _mm_ror_ps(vec,i)	\
+	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(i+3)%4,(unsigned char)(i+2)%4,(unsigned char)(i+1)%4,(unsigned char)(i+0)%4))) : (vec))
+#define _mm_rol_ps(vec,i)	\
+	(((i)%4) ? (_mm_shuffle_ps(vec,vec, _MM_SHUFFLE((unsigned char)(7-i)%4,(unsigned char)(6-i)%4,(unsigned char)(5-i)%4,(unsigned char)(4-i)%4))) : (vec))
+
+#define vec_sld(vec,vec2,x) _mm_ror_ps(vec, ((x)/4))
+
+#define _mm_abs_ps(vec)		_mm_andnot_ps(_MASKSIGN_,vec)
+#define _mm_neg_ps(vec)		_mm_xor_ps(_MASKSIGN_,vec)
+
+#define vec_madd(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b) )
+
+union SSEFloat
+{
+	__m128i vi;
+	__m128 m128;
+	__m128 vf;
+	unsigned int	ui[4];
+	unsigned short s[8];
+	float f[4];
+	SSEFloat(__m128 v) : m128(v) {}
+    SSEFloat(__m128i v) : vi(v) {}
+	SSEFloat() {}//uninitialized
+};
+
+static __forceinline __m128 vec_sel(__m128 a, __m128 b, __m128 mask)
+{
+	return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
+}
+static __forceinline __m128 vec_sel(__m128 a, __m128 b, const unsigned int *_mask)
+{
+	return vec_sel(a, b, _mm_load_ps((float *)_mask));
+}
+static __forceinline __m128 vec_sel(__m128 a, __m128 b, unsigned int _mask)
+{
+	return vec_sel(a, b, _mm_set1_ps(*(float *)&_mask));
+}
+
+static __forceinline __m128 toM128(unsigned int x)
+{
+    return _mm_set1_ps( *(float *)&x );
+}
+
+static __forceinline __m128 fabsf4(__m128 x)
+{
+    return _mm_and_ps( x, toM128( 0x7fffffff ) );
+}
+/*
+union SSE64
+{
+	__m128 m128;
+	struct
+	{
+		__m64 m01;
+		__m64 m23;
+	} m64;
+};
+
+static __forceinline __m128 vec_cts(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	SSE64 sse64;
+	sse64.m64.m01 = _mm_cvttps_pi32(x);
+	sse64.m64.m23 = _mm_cvttps_pi32(_mm_ror_ps(x,2));
+	_mm_empty();
+    return sse64.m128;
+}
+
+static __forceinline __m128 vec_ctf(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	SSE64 sse64;
+	sse64.m128 = x;
+	__m128 result =_mm_movelh_ps(
+		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m01),
+		_mm_cvt_pi2ps(_mm_setzero_ps(), sse64.m64.m23));
+	_mm_empty();
+	return result;
+}
+*/
+static __forceinline __m128 vec_cts(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	__m128i result = _mm_cvtps_epi32(x);
+    return (__m128 &)result;
+}
+
+static __forceinline __m128 vec_ctf(__m128 x, int a)
+{
+	assert(a == 0); // Only 2^0 supported
+	(void)a;
+	return _mm_cvtepi32_ps((__m128i &)x);
+}
+
+#define vec_nmsub(a,b,c) _mm_sub_ps( c, _mm_mul_ps( a, b ) )
+#define vec_sub(a,b) _mm_sub_ps( a, b )
+#define vec_add(a,b) _mm_add_ps( a, b )
+#define vec_mul(a,b) _mm_mul_ps( a, b )
+#define vec_xor(a,b) _mm_xor_ps( a, b )
+#define vec_and(a,b) _mm_and_ps( a, b )
+#define vec_cmpeq(a,b) _mm_cmpeq_ps( a, b )
+#define vec_cmpgt(a,b) _mm_cmpgt_ps( a, b )
+
+#define vec_mergeh(a,b) _mm_unpacklo_ps( a, b )
+#define vec_mergel(a,b) _mm_unpackhi_ps( a, b )
+
+#define vec_andc(a,b) _mm_andnot_ps( b, a )
+
+#define sqrtf4(x) _mm_sqrt_ps( x )
+#define rsqrtf4(x) _mm_rsqrt_ps( x )
+#define recipf4(x) _mm_rcp_ps( x )
+#define negatef4(x) _mm_sub_ps( _mm_setzero_ps(), x )
+
+static __forceinline __m128 newtonrapson_rsqrt4( const __m128 v )
+{   
+#define _half4 _mm_setr_ps(.5f,.5f,.5f,.5f) 
+#define _three _mm_setr_ps(3.f,3.f,3.f,3.f)
+const __m128 approx = _mm_rsqrt_ps( v );   
+const __m128 muls = _mm_mul_ps(_mm_mul_ps(v, approx), approx);   
+return _mm_mul_ps(_mm_mul_ps(_half4, approx), _mm_sub_ps(_three, muls) );
+}
+
+static __forceinline __m128 acosf4(__m128 x)
+{
+    __m128 xabs = fabsf4(x);
+	__m128 select = _mm_cmplt_ps( x, _mm_setzero_ps() );
+    __m128 t1 = sqrtf4(vec_sub(_mm_set1_ps(1.0f), xabs));
+    
+    /* Instruction counts can be reduced if the polynomial was
+     * computed entirely from nested (dependent) fma's. However, 
+     * to reduce the number of pipeline stalls, the polygon is evaluated 
+     * in two halves (hi amd lo). 
+     */
+    __m128 xabs2 = _mm_mul_ps(xabs,  xabs);
+    __m128 xabs4 = _mm_mul_ps(xabs2, xabs2);
+    __m128 hi = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0012624911f),
+		xabs, _mm_set1_ps(0.0066700901f)),
+			xabs, _mm_set1_ps(-0.0170881256f)),
+				xabs, _mm_set1_ps( 0.0308918810f));
+    __m128 lo = vec_madd(vec_madd(vec_madd(_mm_set1_ps(-0.0501743046f),
+		xabs, _mm_set1_ps(0.0889789874f)),
+			xabs, _mm_set1_ps(-0.2145988016f)),
+				xabs, _mm_set1_ps( 1.5707963050f));
+    
+    __m128 result = vec_madd(hi, xabs4, lo);
+    
+    // Adjust the result if x is negactive.
+    return vec_sel(
+		vec_mul(t1, result),									// Positive
+		vec_nmsub(t1, result, _mm_set1_ps(3.1415926535898f)),	// Negative
+		select);
+}
+
+static __forceinline __m128 sinf4(vec_float4 x)
+{
+
+//
+// Common constants used to evaluate sinf4/cosf4/tanf4
+//
+#define _SINCOS_CC0  -0.0013602249f
+#define _SINCOS_CC1   0.0416566950f
+#define _SINCOS_CC2  -0.4999990225f
+#define _SINCOS_SC0  -0.0001950727f
+#define _SINCOS_SC1   0.0083320758f
+#define _SINCOS_SC2  -0.1666665247f
+
+#define _SINCOS_KC1  1.57079625129f
+#define _SINCOS_KC2  7.54978995489e-8f
+
+    vec_float4 xl,xl2,xl3,res;
+
+    // Range reduction using : xl = angle * TwoOverPi;
+    //  
+    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
+
+    // Find the quadrant the angle falls in
+    // using:  q = (int) (ceil(abs(xl))*sign(xl))
+    //
+    vec_int4 q = vec_cts(xl,0);
+
+    // Compute an offset based on the quadrant that the angle falls in
+    // 
+    vec_int4 offset = _mm_and_ps(q,toM128(0x3));
+
+    // Remainder in range [-pi/4..pi/4]
+    //
+    vec_float4 qf = vec_ctf(q,0);
+    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
+    
+    // Compute x^2 and x^3
+    //
+    xl2 = vec_mul(xl,xl);
+    xl3 = vec_mul(xl2,xl);
+    
+    // Compute both the sin and cos of the angles
+    // using a polynomial expression:
+    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
+    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
+    //
+    
+    vec_float4 cx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
+    vec_float4 sx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
+
+    // Use the cosine when the offset is odd and the sin
+    // when the offset is even
+    //
+    res = vec_sel(cx,sx,vec_cmpeq(vec_and(offset,
+                                          toM128(0x1)),
+										  _mm_setzero_ps()));
+
+    // Flip the sign of the result when (offset mod 4) = 1 or 2
+    //
+    return vec_sel(
+		vec_xor(toM128(0x80000000U), res),	// Negative
+		res,								// Positive
+		vec_cmpeq(vec_and(offset,toM128(0x2)),_mm_setzero_ps()));
+}
+
+static __forceinline void sincosf4(vec_float4 x, vec_float4* s, vec_float4* c)
+{
+    vec_float4 xl,xl2,xl3;
+    vec_int4   offsetSin, offsetCos;
+
+    // Range reduction using : xl = angle * TwoOverPi;
+    //  
+    xl = vec_mul(x, _mm_set1_ps(0.63661977236f));
+
+    // Find the quadrant the angle falls in
+    // using:  q = (int) (ceil(abs(xl))*sign(xl))
+    //
+    //vec_int4 q = vec_cts(vec_add(xl,vec_sel(_mm_set1_ps(0.5f),xl,(0x80000000))),0);
+    vec_int4 q = vec_cts(xl,0);
+     
+    // Compute the offset based on the quadrant that the angle falls in.
+    // Add 1 to the offset for the cosine. 
+    //
+    offsetSin = vec_and(q,toM128((int)0x3));
+	__m128i temp = _mm_add_epi32(_mm_set1_epi32(1),(__m128i &)offsetSin);
+	offsetCos = (__m128 &)temp;
+
+    // Remainder in range [-pi/4..pi/4]
+    //
+    vec_float4 qf = vec_ctf(q,0);
+    xl  = vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC2),vec_nmsub(qf,_mm_set1_ps(_SINCOS_KC1),x));
+    
+    // Compute x^2 and x^3
+    //
+    xl2 = vec_mul(xl,xl);
+    xl3 = vec_mul(xl2,xl);
+    
+    // Compute both the sin and cos of the angles
+    // using a polynomial expression:
+    //   cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2), and
+    //   sx = xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
+    //
+    vec_float4 cx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_CC0),xl2,_mm_set1_ps(_SINCOS_CC1)),xl2,_mm_set1_ps(_SINCOS_CC2)),xl2,_mm_set1_ps(1.0f));
+    vec_float4 sx =
+		vec_madd(
+			vec_madd(
+				vec_madd(_mm_set1_ps(_SINCOS_SC0),xl2,_mm_set1_ps(_SINCOS_SC1)),xl2,_mm_set1_ps(_SINCOS_SC2)),xl3,xl);
+
+    // Use the cosine when the offset is odd and the sin
+    // when the offset is even
+    //
+    vec_uint4 sinMask = (vec_uint4)vec_cmpeq(vec_and(offsetSin,toM128(0x1)),_mm_setzero_ps());
+    vec_uint4 cosMask = (vec_uint4)vec_cmpeq(vec_and(offsetCos,toM128(0x1)),_mm_setzero_ps());    
+    *s = vec_sel(cx,sx,sinMask);
+    *c = vec_sel(cx,sx,cosMask);
+
+    // Flip the sign of the result when (offset mod 4) = 1 or 2
+    //
+    sinMask = vec_cmpeq(vec_and(offsetSin,toM128(0x2)),_mm_setzero_ps());
+    cosMask = vec_cmpeq(vec_and(offsetCos,toM128(0x2)),_mm_setzero_ps());
+    
+    *s = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*s),*s,sinMask);
+    *c = vec_sel((vec_float4)vec_xor(toM128(0x80000000),(vec_uint4)*c),*c,cosMask);    
+}
+
+#include "vecidx_aos.h"
+#include "floatInVec.h"
+#include "boolInVec.h"
+
+#ifdef _VECTORMATH_DEBUG
+#include <stdio.h>
+#endif
+namespace Vectormath {
+
+namespace Aos {
+
+//-----------------------------------------------------------------------------
+// Forward Declarations
+//
+
+class Vector3;
+class Vector4;
+class Point3;
+class Quat;
+class Matrix3;
+class Matrix4;
+class Transform3;
+
+// A 3-D vector in array-of-structures format
+//
+class Vector3
+{
+    __m128 mVec128;
+
+	__forceinline void set128(vec_float4 vec);
+	 
+	 __forceinline  vec_float4& get128Ref();
+
+public:
+    // Default constructor; does no initialization
+    // 
+    __forceinline Vector3( ) { };
+
+	// Default copy constructor
+    // 
+	__forceinline Vector3(const Vector3& vec);
+
+    // Construct a 3-D vector from x, y, and z elements
+    // 
+    __forceinline Vector3( float x, float y, float z );
+
+    // Construct a 3-D vector from x, y, and z elements (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
+
+    // Copy elements from a 3-D point into a 3-D vector
+    // 
+    explicit __forceinline Vector3( const Point3 &pnt );
+
+    // Set all elements of a 3-D vector to the same scalar value
+    // 
+    explicit __forceinline Vector3( float scalar );
+
+    // Set all elements of a 3-D vector to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit __forceinline Vector3( const floatInVec &scalar );
+
+    // Set vector float data in a 3-D vector
+    // 
+    explicit __forceinline Vector3( __m128 vf4 );
+
+    // Get vector float data from a 3-D vector
+    // 
+    __forceinline __m128 get128( ) const;
+
+    // Assign one 3-D vector to another
+    // 
+    __forceinline Vector3 & operator =( const Vector3 &vec );
+
+    // Set the x element of a 3-D vector
+    // 
+    __forceinline Vector3 & setX( float x );
+
+    // Set the y element of a 3-D vector
+    // 
+    __forceinline Vector3 & setY( float y );
+
+    // Set the z element of a 3-D vector
+    // 
+    __forceinline Vector3 & setZ( float z );
+
+    // Set the x element of a 3-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3 & setX( const floatInVec &x );
+
+    // Set the y element of a 3-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3 & setY( const floatInVec &y );
+
+    // Set the z element of a 3-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3 & setZ( const floatInVec &z );
+
+    // Get the x element of a 3-D vector
+    // 
+    __forceinline const floatInVec getX( ) const;
+
+    // Get the y element of a 3-D vector
+    // 
+    __forceinline const floatInVec getY( ) const;
+
+    // Get the z element of a 3-D vector
+    // 
+    __forceinline const floatInVec getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D vector by index
+    // 
+    __forceinline Vector3 & setElem( int idx, float value );
+
+    // Set an x, y, or z element of a 3-D vector by index (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3 & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, or z element of a 3-D vector by index
+    // 
+    __forceinline const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    __forceinline VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    __forceinline const floatInVec operator []( int idx ) const;
+
+    // Add two 3-D vectors
+    // 
+    __forceinline const Vector3 operator +( const Vector3 &vec ) const;
+
+    // Subtract a 3-D vector from another 3-D vector
+    // 
+    __forceinline const Vector3 operator -( const Vector3 &vec ) const;
+
+    // Add a 3-D vector to a 3-D point
+    // 
+    __forceinline const Point3 operator +( const Point3 &pnt ) const;
+
+    // Multiply a 3-D vector by a scalar
+    // 
+    __forceinline const Vector3 operator *( float scalar ) const;
+
+    // Divide a 3-D vector by a scalar
+    // 
+    __forceinline const Vector3 operator /( float scalar ) const;
+
+    // Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Vector3 operator *( const floatInVec &scalar ) const;
+
+    // Divide a 3-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Vector3 operator /( const floatInVec &scalar ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    __forceinline Vector3 & operator +=( const Vector3 &vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    __forceinline Vector3 & operator -=( const Vector3 &vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    __forceinline Vector3 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    __forceinline Vector3 & operator /=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Vector3 & operator /=( const floatInVec &scalar );
+
+    // Negate all elements of a 3-D vector
+    // 
+    __forceinline const Vector3 operator -( ) const;
+
+    // Construct x axis
+    // 
+    static __forceinline const Vector3 xAxis( );
+
+    // Construct y axis
+    // 
+    static __forceinline const Vector3 yAxis( );
+
+    // Construct z axis
+    // 
+    static __forceinline const Vector3 zAxis( );
+
+};
+
+// Multiply a 3-D vector by a scalar
+// 
+__forceinline const Vector3 operator *( float scalar, const Vector3 &vec );
+
+// Multiply a 3-D vector by a scalar (scalar data contained in vector data type)
+// 
+__forceinline const Vector3 operator *( const floatInVec &scalar, const Vector3 &vec );
+
+// Multiply two 3-D vectors per element
+// 
+__forceinline const Vector3 mulPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Divide two 3-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+__forceinline const Vector3 divPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Compute the reciprocal of a 3-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+__forceinline const Vector3 recipPerElem( const Vector3 &vec );
+
+// Compute the absolute value of a 3-D vector per element
+// 
+__forceinline const Vector3 absPerElem( const Vector3 &vec );
+
+// Copy sign from one 3-D vector to another, per element
+// 
+__forceinline const Vector3 copySignPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Maximum of two 3-D vectors per element
+// 
+__forceinline const Vector3 maxPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Minimum of two 3-D vectors per element
+// 
+__forceinline const Vector3 minPerElem( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Maximum element of a 3-D vector
+// 
+__forceinline const floatInVec maxElem( const Vector3 &vec );
+
+// Minimum element of a 3-D vector
+// 
+__forceinline const floatInVec minElem( const Vector3 &vec );
+
+// Compute the sum of all elements of a 3-D vector
+// 
+__forceinline const floatInVec sum( const Vector3 &vec );
+
+// Compute the dot product of two 3-D vectors
+// 
+__forceinline const floatInVec dot( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Compute the square of the length of a 3-D vector
+// 
+__forceinline const floatInVec lengthSqr( const Vector3 &vec );
+
+// Compute the length of a 3-D vector
+// 
+__forceinline const floatInVec length( const Vector3 &vec );
+
+// Normalize a 3-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+__forceinline const Vector3 normalize( const Vector3 &vec );
+
+// Compute cross product of two 3-D vectors
+// 
+__forceinline const Vector3 cross( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Outer product of two 3-D vectors
+// 
+__forceinline const Matrix3 outer( const Vector3 &vec0, const Vector3 &vec1 );
+
+// Pre-multiply a row vector by a 3x3 matrix
+// NOTE: 
+// Slower than column post-multiply.
+// 
+__forceinline const Vector3 rowMul( const Vector3 &vec, const Matrix3 & mat );
+
+// Cross-product matrix of a 3-D vector
+// 
+__forceinline const Matrix3 crossMatrix( const Vector3 &vec );
+
+// Create cross-product matrix and multiply
+// NOTE: 
+// Faster than separately creating a cross-product matrix and multiplying.
+// 
+__forceinline const Matrix3 crossMatrixMul( const Vector3 &vec, const Matrix3 & mat );
+
+// Linear interpolation between two 3-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Vector3 lerp( float t, const Vector3 &vec0, const Vector3 &vec1 );
+
+// Linear interpolation between two 3-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Vector3 lerp( const floatInVec &t, const Vector3 &vec0, const Vector3 &vec1 );
+
+// Spherical linear interpolation between two 3-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Vector3 slerp( float t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
+
+// Spherical linear interpolation between two 3-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Vector3 slerp( const floatInVec &t, const Vector3 &unitVec0, const Vector3 &unitVec1 );
+
+// Conditionally select between two 3-D vectors
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+__forceinline const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, bool select1 );
+
+// Conditionally select between two 3-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+__forceinline const Vector3 select( const Vector3 &vec0, const Vector3 &vec1, const boolInVec &select1 );
+
+// Store x, y, and z elements of 3-D vector in first three words of a quadword, preserving fourth word
+// 
+__forceinline void storeXYZ( const Vector3 &vec, __m128 * quad );
+
+// Load four three-float 3-D vectors, stored in three quadwords
+// 
+__forceinline void loadXYZArray( Vector3 & vec0, Vector3 & vec1, Vector3 & vec2, Vector3 & vec3, const __m128 * threeQuads );
+
+// Store four 3-D vectors in three quadwords
+// 
+__forceinline void storeXYZArray( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, __m128 * threeQuads );
+
+// Store eight 3-D vectors as half-floats
+// 
+__forceinline void storeHalfFloats( const Vector3 &vec0, const Vector3 &vec1, const Vector3 &vec2, const Vector3 &vec3, const Vector3 &vec4, const Vector3 &vec5, const Vector3 &vec6, const Vector3 &vec7, vec_ushort8 * threeQuads );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Vector3 &vec );
+
+// Print a 3-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Vector3 &vec, const char * name );
+
+#endif
+
+// A 4-D vector in array-of-structures format
+//
+class Vector4
+{
+    __m128 mVec128;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    __forceinline Vector4( ) { };
+
+    // Construct a 4-D vector from x, y, z, and w elements
+    // 
+    __forceinline Vector4( float x, float y, float z, float w );
+
+    // Construct a 4-D vector from x, y, z, and w elements (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
+
+    // Construct a 4-D vector from a 3-D vector and a scalar
+    // 
+    __forceinline Vector4( const Vector3 &xyz, float w );
+
+    // Construct a 4-D vector from a 3-D vector and a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4( const Vector3 &xyz, const floatInVec &w );
+
+    // Copy x, y, and z from a 3-D vector into a 4-D vector, and set w to 0
+    // 
+    explicit __forceinline Vector4( const Vector3 &vec );
+
+    // Copy x, y, and z from a 3-D point into a 4-D vector, and set w to 1
+    // 
+    explicit __forceinline Vector4( const Point3 &pnt );
+
+    // Copy elements from a quaternion into a 4-D vector
+    // 
+    explicit __forceinline Vector4( const Quat &quat );
+
+    // Set all elements of a 4-D vector to the same scalar value
+    // 
+    explicit __forceinline Vector4( float scalar );
+
+    // Set all elements of a 4-D vector to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit __forceinline Vector4( const floatInVec &scalar );
+
+    // Set vector float data in a 4-D vector
+    // 
+    explicit __forceinline Vector4( __m128 vf4 );
+
+    // Get vector float data from a 4-D vector
+    // 
+    __forceinline __m128 get128( ) const;
+
+    // Assign one 4-D vector to another
+    // 
+    __forceinline Vector4 & operator =( const Vector4 &vec );
+
+    // Set the x, y, and z elements of a 4-D vector
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    __forceinline Vector4 & setXYZ( const Vector3 &vec );
+
+    // Get the x, y, and z elements of a 4-D vector
+    // 
+    __forceinline const Vector3 getXYZ( ) const;
+
+    // Set the x element of a 4-D vector
+    // 
+    __forceinline Vector4 & setX( float x );
+
+    // Set the y element of a 4-D vector
+    // 
+    __forceinline Vector4 & setY( float y );
+
+    // Set the z element of a 4-D vector
+    // 
+    __forceinline Vector4 & setZ( float z );
+
+    // Set the w element of a 4-D vector
+    // 
+    __forceinline Vector4 & setW( float w );
+
+    // Set the x element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & setX( const floatInVec &x );
+
+    // Set the y element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & setY( const floatInVec &y );
+
+    // Set the z element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & setZ( const floatInVec &z );
+
+    // Set the w element of a 4-D vector (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & setW( const floatInVec &w );
+
+    // Get the x element of a 4-D vector
+    // 
+    __forceinline const floatInVec getX( ) const;
+
+    // Get the y element of a 4-D vector
+    // 
+    __forceinline const floatInVec getY( ) const;
+
+    // Get the z element of a 4-D vector
+    // 
+    __forceinline const floatInVec getZ( ) const;
+
+    // Get the w element of a 4-D vector
+    // 
+    __forceinline const floatInVec getW( ) const;
+
+    // Set an x, y, z, or w element of a 4-D vector by index
+    // 
+    __forceinline Vector4 & setElem( int idx, float value );
+
+    // Set an x, y, z, or w element of a 4-D vector by index (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, z, or w element of a 4-D vector by index
+    // 
+    __forceinline const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    __forceinline VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    __forceinline const floatInVec operator []( int idx ) const;
+
+    // Add two 4-D vectors
+    // 
+    __forceinline const Vector4 operator +( const Vector4 &vec ) const;
+
+    // Subtract a 4-D vector from another 4-D vector
+    // 
+    __forceinline const Vector4 operator -( const Vector4 &vec ) const;
+
+    // Multiply a 4-D vector by a scalar
+    // 
+    __forceinline const Vector4 operator *( float scalar ) const;
+
+    // Divide a 4-D vector by a scalar
+    // 
+    __forceinline const Vector4 operator /( float scalar ) const;
+
+    // Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Vector4 operator *( const floatInVec &scalar ) const;
+
+    // Divide a 4-D vector by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Vector4 operator /( const floatInVec &scalar ) const;
+
+    // Perform compound assignment and addition with a 4-D vector
+    // 
+    __forceinline Vector4 & operator +=( const Vector4 &vec );
+
+    // Perform compound assignment and subtraction by a 4-D vector
+    // 
+    __forceinline Vector4 & operator -=( const Vector4 &vec );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    __forceinline Vector4 & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    __forceinline Vector4 & operator /=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Vector4 & operator /=( const floatInVec &scalar );
+
+    // Negate all elements of a 4-D vector
+    // 
+    __forceinline const Vector4 operator -( ) const;
+
+    // Construct x axis
+    // 
+    static __forceinline const Vector4 xAxis( );
+
+    // Construct y axis
+    // 
+    static __forceinline const Vector4 yAxis( );
+
+    // Construct z axis
+    // 
+    static __forceinline const Vector4 zAxis( );
+
+    // Construct w axis
+    // 
+    static __forceinline const Vector4 wAxis( );
+
+};
+
+// Multiply a 4-D vector by a scalar
+// 
+__forceinline const Vector4 operator *( float scalar, const Vector4 &vec );
+
+// Multiply a 4-D vector by a scalar (scalar data contained in vector data type)
+// 
+__forceinline const Vector4 operator *( const floatInVec &scalar, const Vector4 &vec );
+
+// Multiply two 4-D vectors per element
+// 
+__forceinline const Vector4 mulPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Divide two 4-D vectors per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+__forceinline const Vector4 divPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Compute the reciprocal of a 4-D vector per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+__forceinline const Vector4 recipPerElem( const Vector4 &vec );
+
+// Compute the absolute value of a 4-D vector per element
+// 
+__forceinline const Vector4 absPerElem( const Vector4 &vec );
+
+// Copy sign from one 4-D vector to another, per element
+// 
+__forceinline const Vector4 copySignPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Maximum of two 4-D vectors per element
+// 
+__forceinline const Vector4 maxPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Minimum of two 4-D vectors per element
+// 
+__forceinline const Vector4 minPerElem( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Maximum element of a 4-D vector
+// 
+__forceinline const floatInVec maxElem( const Vector4 &vec );
+
+// Minimum element of a 4-D vector
+// 
+__forceinline const floatInVec minElem( const Vector4 &vec );
+
+// Compute the sum of all elements of a 4-D vector
+// 
+__forceinline const floatInVec sum( const Vector4 &vec );
+
+// Compute the dot product of two 4-D vectors
+// 
+__forceinline const floatInVec dot( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Compute the square of the length of a 4-D vector
+// 
+__forceinline const floatInVec lengthSqr( const Vector4 &vec );
+
+// Compute the length of a 4-D vector
+// 
+__forceinline const floatInVec length( const Vector4 &vec );
+
+// Normalize a 4-D vector
+// NOTE: 
+// The result is unpredictable when all elements of vec are at or near zero.
+// 
+__forceinline const Vector4 normalize( const Vector4 &vec );
+
+// Outer product of two 4-D vectors
+// 
+__forceinline const Matrix4 outer( const Vector4 &vec0, const Vector4 &vec1 );
+
+// Linear interpolation between two 4-D vectors
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Vector4 lerp( float t, const Vector4 &vec0, const Vector4 &vec1 );
+
+// Linear interpolation between two 4-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Vector4 lerp( const floatInVec &t, const Vector4 &vec0, const Vector4 &vec1 );
+
+// Spherical linear interpolation between two 4-D vectors
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Vector4 slerp( float t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
+
+// Spherical linear interpolation between two 4-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// The result is unpredictable if the vectors point in opposite directions.
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Vector4 slerp( const floatInVec &t, const Vector4 &unitVec0, const Vector4 &unitVec1 );
+
+// Conditionally select between two 4-D vectors
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+__forceinline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, bool select1 );
+
+// Conditionally select between two 4-D vectors (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+__forceinline const Vector4 select( const Vector4 &vec0, const Vector4 &vec1, const boolInVec &select1 );
+
+// Store four 4-D vectors as half-floats
+// 
+__forceinline void storeHalfFloats( const Vector4 &vec0, const Vector4 &vec1, const Vector4 &vec2, const Vector4 &vec3, vec_ushort8 * twoQuads );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4-D vector
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Vector4 &vec );
+
+// Print a 4-D vector and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Vector4 &vec, const char * name );
+
+#endif
+
+// A 3-D point in array-of-structures format
+//
+class Point3
+{
+    __m128 mVec128;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    __forceinline Point3( ) { };
+
+    // Construct a 3-D point from x, y, and z elements
+    // 
+    __forceinline Point3( float x, float y, float z );
+
+    // Construct a 3-D point from x, y, and z elements (scalar data contained in vector data type)
+    // 
+    __forceinline Point3( const floatInVec &x, const floatInVec &y, const floatInVec &z );
+
+    // Copy elements from a 3-D vector into a 3-D point
+    // 
+    explicit __forceinline Point3( const Vector3 &vec );
+
+    // Set all elements of a 3-D point to the same scalar value
+    // 
+    explicit __forceinline Point3( float scalar );
+
+    // Set all elements of a 3-D point to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit __forceinline Point3( const floatInVec &scalar );
+
+    // Set vector float data in a 3-D point
+    // 
+    explicit __forceinline Point3( __m128 vf4 );
+
+    // Get vector float data from a 3-D point
+    // 
+    __forceinline __m128 get128( ) const;
+
+    // Assign one 3-D point to another
+    // 
+    __forceinline Point3 & operator =( const Point3 &pnt );
+
+    // Set the x element of a 3-D point
+    // 
+    __forceinline Point3 & setX( float x );
+
+    // Set the y element of a 3-D point
+    // 
+    __forceinline Point3 & setY( float y );
+
+    // Set the z element of a 3-D point
+    // 
+    __forceinline Point3 & setZ( float z );
+
+    // Set the x element of a 3-D point (scalar data contained in vector data type)
+    // 
+    __forceinline Point3 & setX( const floatInVec &x );
+
+    // Set the y element of a 3-D point (scalar data contained in vector data type)
+    // 
+    __forceinline Point3 & setY( const floatInVec &y );
+
+    // Set the z element of a 3-D point (scalar data contained in vector data type)
+    // 
+    __forceinline Point3 & setZ( const floatInVec &z );
+
+    // Get the x element of a 3-D point
+    // 
+    __forceinline const floatInVec getX( ) const;
+
+    // Get the y element of a 3-D point
+    // 
+    __forceinline const floatInVec getY( ) const;
+
+    // Get the z element of a 3-D point
+    // 
+    __forceinline const floatInVec getZ( ) const;
+
+    // Set an x, y, or z element of a 3-D point by index
+    // 
+    __forceinline Point3 & setElem( int idx, float value );
+
+    // Set an x, y, or z element of a 3-D point by index (scalar data contained in vector data type)
+    // 
+    __forceinline Point3 & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, or z element of a 3-D point by index
+    // 
+    __forceinline const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    __forceinline VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    __forceinline const floatInVec operator []( int idx ) const;
+
+    // Subtract a 3-D point from another 3-D point
+    // 
+    __forceinline const Vector3 operator -( const Point3 &pnt ) const;
+
+    // Add a 3-D point to a 3-D vector
+    // 
+    __forceinline const Point3 operator +( const Vector3 &vec ) const;
+
+    // Subtract a 3-D vector from a 3-D point
+    // 
+    __forceinline const Point3 operator -( const Vector3 &vec ) const;
+
+    // Perform compound assignment and addition with a 3-D vector
+    // 
+    __forceinline Point3 & operator +=( const Vector3 &vec );
+
+    // Perform compound assignment and subtraction by a 3-D vector
+    // 
+    __forceinline Point3 & operator -=( const Vector3 &vec );
+
+};
+
+// Multiply two 3-D points per element
+// 
+__forceinline const Point3 mulPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Divide two 3-D points per element
+// NOTE: 
+// Floating-point behavior matches standard library function divf4.
+// 
+__forceinline const Point3 divPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Compute the reciprocal of a 3-D point per element
+// NOTE: 
+// Floating-point behavior matches standard library function recipf4.
+// 
+__forceinline const Point3 recipPerElem( const Point3 &pnt );
+
+// Compute the absolute value of a 3-D point per element
+// 
+__forceinline const Point3 absPerElem( const Point3 &pnt );
+
+// Copy sign from one 3-D point to another, per element
+// 
+__forceinline const Point3 copySignPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Maximum of two 3-D points per element
+// 
+__forceinline const Point3 maxPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Minimum of two 3-D points per element
+// 
+__forceinline const Point3 minPerElem( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Maximum element of a 3-D point
+// 
+__forceinline const floatInVec maxElem( const Point3 &pnt );
+
+// Minimum element of a 3-D point
+// 
+__forceinline const floatInVec minElem( const Point3 &pnt );
+
+// Compute the sum of all elements of a 3-D point
+// 
+__forceinline const floatInVec sum( const Point3 &pnt );
+
+// Apply uniform scale to a 3-D point
+// 
+__forceinline const Point3 scale( const Point3 &pnt, float scaleVal );
+
+// Apply uniform scale to a 3-D point (scalar data contained in vector data type)
+// 
+__forceinline const Point3 scale( const Point3 &pnt, const floatInVec &scaleVal );
+
+// Apply non-uniform scale to a 3-D point
+// 
+__forceinline const Point3 scale( const Point3 &pnt, const Vector3 &scaleVec );
+
+// Scalar projection of a 3-D point on a unit-length 3-D vector
+// 
+__forceinline const floatInVec projection( const Point3 &pnt, const Vector3 &unitVec );
+
+// Compute the square of the distance of a 3-D point from the coordinate-system origin
+// 
+__forceinline const floatInVec distSqrFromOrigin( const Point3 &pnt );
+
+// Compute the distance of a 3-D point from the coordinate-system origin
+// 
+__forceinline const floatInVec distFromOrigin( const Point3 &pnt );
+
+// Compute the square of the distance between two 3-D points
+// 
+__forceinline const floatInVec distSqr( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Compute the distance between two 3-D points
+// 
+__forceinline const floatInVec dist( const Point3 &pnt0, const Point3 &pnt1 );
+
+// Linear interpolation between two 3-D points
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Point3 lerp( float t, const Point3 &pnt0, const Point3 &pnt1 );
+
+// Linear interpolation between two 3-D points (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Point3 lerp( const floatInVec &t, const Point3 &pnt0, const Point3 &pnt1 );
+
+// Conditionally select between two 3-D points
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+__forceinline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, bool select1 );
+
+// Conditionally select between two 3-D points (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+__forceinline const Point3 select( const Point3 &pnt0, const Point3 &pnt1, const boolInVec &select1 );
+
+// Store x, y, and z elements of 3-D point in first three words of a quadword, preserving fourth word
+// 
+__forceinline void storeXYZ( const Point3 &pnt, __m128 * quad );
+
+// Load four three-float 3-D points, stored in three quadwords
+// 
+__forceinline void loadXYZArray( Point3 & pnt0, Point3 & pnt1, Point3 & pnt2, Point3 & pnt3, const __m128 * threeQuads );
+
+// Store four 3-D points in three quadwords
+// 
+__forceinline void storeXYZArray( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, __m128 * threeQuads );
+
+// Store eight 3-D points as half-floats
+// 
+__forceinline void storeHalfFloats( const Point3 &pnt0, const Point3 &pnt1, const Point3 &pnt2, const Point3 &pnt3, const Point3 &pnt4, const Point3 &pnt5, const Point3 &pnt6, const Point3 &pnt7, vec_ushort8 * threeQuads );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3-D point
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Point3 &pnt );
+
+// Print a 3-D point and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Point3 &pnt, const char * name );
+
+#endif
+
+// A quaternion in array-of-structures format
+//
+class Quat
+{
+    __m128 mVec128;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    __forceinline Quat( ) { };
+
+	__forceinline  Quat::Quat(const Quat& quat);
+
+    // Construct a quaternion from x, y, z, and w elements
+    // 
+    __forceinline Quat( float x, float y, float z, float w );
+
+    // Construct a quaternion from x, y, z, and w elements (scalar data contained in vector data type)
+    // 
+    __forceinline Quat( const floatInVec &x, const floatInVec &y, const floatInVec &z, const floatInVec &w );
+
+    // Construct a quaternion from a 3-D vector and a scalar
+    // 
+    __forceinline Quat( const Vector3 &xyz, float w );
+
+    // Construct a quaternion from a 3-D vector and a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Quat( const Vector3 &xyz, const floatInVec &w );
+
+    // Copy elements from a 4-D vector into a quaternion
+    // 
+    explicit __forceinline Quat( const Vector4 &vec );
+
+    // Convert a rotation matrix to a unit-length quaternion
+    // 
+    explicit __forceinline Quat( const Matrix3 & rotMat );
+
+    // Set all elements of a quaternion to the same scalar value
+    // 
+    explicit __forceinline Quat( float scalar );
+
+    // Set all elements of a quaternion to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit __forceinline Quat( const floatInVec &scalar );
+
+    // Set vector float data in a quaternion
+    // 
+    explicit __forceinline Quat( __m128 vf4 );
+
+    // Get vector float data from a quaternion
+    // 
+    __forceinline __m128 get128( ) const;
+
+	// Set a quaterion from vector float data
+    //
+	__forceinline void set128(vec_float4 vec);
+
+    // Assign one quaternion to another
+    // 
+    __forceinline Quat & operator =( const Quat &quat );
+
+    // Set the x, y, and z elements of a quaternion
+    // NOTE: 
+    // This function does not change the w element.
+    // 
+    __forceinline Quat & setXYZ( const Vector3 &vec );
+
+    // Get the x, y, and z elements of a quaternion
+    // 
+    __forceinline const Vector3 getXYZ( ) const;
+
+    // Set the x element of a quaternion
+    // 
+    __forceinline Quat & setX( float x );
+
+    // Set the y element of a quaternion
+    // 
+    __forceinline Quat & setY( float y );
+
+    // Set the z element of a quaternion
+    // 
+    __forceinline Quat & setZ( float z );
+
+    // Set the w element of a quaternion
+    // 
+    __forceinline Quat & setW( float w );
+
+    // Set the x element of a quaternion (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & setX( const floatInVec &x );
+
+    // Set the y element of a quaternion (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & setY( const floatInVec &y );
+
+    // Set the z element of a quaternion (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & setZ( const floatInVec &z );
+
+    // Set the w element of a quaternion (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & setW( const floatInVec &w );
+
+    // Get the x element of a quaternion
+    // 
+    __forceinline const floatInVec getX( ) const;
+
+    // Get the y element of a quaternion
+    // 
+    __forceinline const floatInVec getY( ) const;
+
+    // Get the z element of a quaternion
+    // 
+    __forceinline const floatInVec getZ( ) const;
+
+    // Get the w element of a quaternion
+    // 
+    __forceinline const floatInVec getW( ) const;
+
+    // Set an x, y, z, or w element of a quaternion by index
+    // 
+    __forceinline Quat & setElem( int idx, float value );
+
+    // Set an x, y, z, or w element of a quaternion by index (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & setElem( int idx, const floatInVec &value );
+
+    // Get an x, y, z, or w element of a quaternion by index
+    // 
+    __forceinline const floatInVec getElem( int idx ) const;
+
+    // Subscripting operator to set or get an element
+    // 
+    __forceinline VecIdx operator []( int idx );
+
+    // Subscripting operator to get an element
+    // 
+    __forceinline const floatInVec operator []( int idx ) const;
+
+    // Add two quaternions
+    // 
+    __forceinline const Quat operator +( const Quat &quat ) const;
+
+    // Subtract a quaternion from another quaternion
+    // 
+    __forceinline const Quat operator -( const Quat &quat ) const;
+
+    // Multiply two quaternions
+    // 
+    __forceinline const Quat operator *( const Quat &quat ) const;
+
+    // Multiply a quaternion by a scalar
+    // 
+    __forceinline const Quat operator *( float scalar ) const;
+
+    // Divide a quaternion by a scalar
+    // 
+    __forceinline const Quat operator /( float scalar ) const;
+
+    // Multiply a quaternion by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Quat operator *( const floatInVec &scalar ) const;
+
+    // Divide a quaternion by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Quat operator /( const floatInVec &scalar ) const;
+
+    // Perform compound assignment and addition with a quaternion
+    // 
+    __forceinline Quat & operator +=( const Quat &quat );
+
+    // Perform compound assignment and subtraction by a quaternion
+    // 
+    __forceinline Quat & operator -=( const Quat &quat );
+
+    // Perform compound assignment and multiplication by a quaternion
+    // 
+    __forceinline Quat & operator *=( const Quat &quat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    __forceinline Quat & operator *=( float scalar );
+
+    // Perform compound assignment and division by a scalar
+    // 
+    __forceinline Quat & operator /=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and division by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Quat & operator /=( const floatInVec &scalar );
+
+    // Negate all elements of a quaternion
+    // 
+    __forceinline const Quat operator -( ) const;
+
+    // Construct an identity quaternion
+    // 
+    static __forceinline const Quat identity( );
+
+    // Construct a quaternion to rotate between two unit-length 3-D vectors
+    // NOTE: 
+    // The result is unpredictable if unitVec0 and unitVec1 point in opposite directions.
+    // 
+    static __forceinline const Quat rotation( const Vector3 &unitVec0, const Vector3 &unitVec1 );
+
+    // Construct a quaternion to rotate around a unit-length 3-D vector
+    // 
+    static __forceinline const Quat rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a quaternion to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static __forceinline const Quat rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a quaternion to rotate around the x axis
+    // 
+    static __forceinline const Quat rotationX( float radians );
+
+    // Construct a quaternion to rotate around the y axis
+    // 
+    static __forceinline const Quat rotationY( float radians );
+
+    // Construct a quaternion to rotate around the z axis
+    // 
+    static __forceinline const Quat rotationZ( float radians );
+
+    // Construct a quaternion to rotate around the x axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Quat rotationX( const floatInVec &radians );
+
+    // Construct a quaternion to rotate around the y axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Quat rotationY( const floatInVec &radians );
+
+    // Construct a quaternion to rotate around the z axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Quat rotationZ( const floatInVec &radians );
+
+};
+
+// Multiply a quaternion by a scalar
+// 
+__forceinline const Quat operator *( float scalar, const Quat &quat );
+
+// Multiply a quaternion by a scalar (scalar data contained in vector data type)
+// 
+__forceinline const Quat operator *( const floatInVec &scalar, const Quat &quat );
+
+// Compute the conjugate of a quaternion
+// 
+__forceinline const Quat conj( const Quat &quat );
+
+// Use a unit-length quaternion to rotate a 3-D vector
+// 
+__forceinline const Vector3 rotate( const Quat &unitQuat, const Vector3 &vec );
+
+// Compute the dot product of two quaternions
+// 
+__forceinline const floatInVec dot( const Quat &quat0, const Quat &quat1 );
+
+// Compute the norm of a quaternion
+// 
+__forceinline const floatInVec norm( const Quat &quat );
+
+// Compute the length of a quaternion
+// 
+__forceinline const floatInVec length( const Quat &quat );
+
+// Normalize a quaternion
+// NOTE: 
+// The result is unpredictable when all elements of quat are at or near zero.
+// 
+__forceinline const Quat normalize( const Quat &quat );
+
+// Linear interpolation between two quaternions
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Quat lerp( float t, const Quat &quat0, const Quat &quat1 );
+
+// Linear interpolation between two quaternions (scalar data contained in vector data type)
+// NOTE: 
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Quat lerp( const floatInVec &t, const Quat &quat0, const Quat &quat1 );
+
+// Spherical linear interpolation between two quaternions
+// NOTE: 
+// Interpolates along the shortest path between orientations.
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Quat slerp( float t, const Quat &unitQuat0, const Quat &unitQuat1 );
+
+// Spherical linear interpolation between two quaternions (scalar data contained in vector data type)
+// NOTE: 
+// Interpolates along the shortest path between orientations.
+// Does not clamp t between 0 and 1.
+// 
+__forceinline const Quat slerp( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1 );
+
+// Spherical quadrangle interpolation
+// 
+__forceinline const Quat squad( float t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
+
+// Spherical quadrangle interpolation (scalar data contained in vector data type)
+// 
+__forceinline const Quat squad( const floatInVec &t, const Quat &unitQuat0, const Quat &unitQuat1, const Quat &unitQuat2, const Quat &unitQuat3 );
+
+// Conditionally select between two quaternions
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+__forceinline const Quat select( const Quat &quat0, const Quat &quat1, bool select1 );
+
+// Conditionally select between two quaternions (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+__forceinline const Quat select( const Quat &quat0, const Quat &quat1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a quaternion
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Quat &quat );
+
+// Print a quaternion and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Quat &quat, const char * name );
+
+#endif
+
+// A 3x3 matrix in array-of-structures format
+//
+class Matrix3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    __forceinline Matrix3( ) { };
+
+    // Copy a 3x3 matrix
+    // 
+    __forceinline Matrix3( const Matrix3 & mat );
+
+    // Construct a 3x3 matrix containing the specified columns
+    // 
+    __forceinline Matrix3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2 );
+
+    // Construct a 3x3 rotation matrix from a unit-length quaternion
+    // 
+    explicit __forceinline Matrix3( const Quat &unitQuat );
+
+    // Set all elements of a 3x3 matrix to the same scalar value
+    // 
+    explicit __forceinline Matrix3( float scalar );
+
+    // Set all elements of a 3x3 matrix to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit __forceinline Matrix3( const floatInVec &scalar );
+
+    // Assign one 3x3 matrix to another
+    // 
+    __forceinline Matrix3 & operator =( const Matrix3 & mat );
+
+    // Set column 0 of a 3x3 matrix
+    // 
+    __forceinline Matrix3 & setCol0( const Vector3 &col0 );
+
+    // Set column 1 of a 3x3 matrix
+    // 
+    __forceinline Matrix3 & setCol1( const Vector3 &col1 );
+
+    // Set column 2 of a 3x3 matrix
+    // 
+    __forceinline Matrix3 & setCol2( const Vector3 &col2 );
+
+    // Get column 0 of a 3x3 matrix
+    // 
+    __forceinline const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x3 matrix
+    // 
+    __forceinline const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x3 matrix
+    // 
+    __forceinline const Vector3 getCol2( ) const;
+
+    // Set the column of a 3x3 matrix referred to by the specified index
+    // 
+    __forceinline Matrix3 & setCol( int col, const Vector3 &vec );
+
+    // Set the row of a 3x3 matrix referred to by the specified index
+    // 
+    __forceinline Matrix3 & setRow( int row, const Vector3 &vec );
+
+    // Get the column of a 3x3 matrix referred to by the specified index
+    // 
+    __forceinline const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x3 matrix referred to by the specified index
+    // 
+    __forceinline const Vector3 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    __forceinline Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    __forceinline const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x3 matrix referred to by column and row indices
+    // 
+    __forceinline Matrix3 & setElem( int col, int row, float val );
+
+    // Set the element of a 3x3 matrix referred to by column and row indices (scalar data contained in vector data type)
+    // 
+    __forceinline Matrix3 & setElem( int col, int row, const floatInVec &val );
+
+    // Get the element of a 3x3 matrix referred to by column and row indices
+    // 
+    __forceinline const floatInVec getElem( int col, int row ) const;
+
+    // Add two 3x3 matrices
+    // 
+    __forceinline const Matrix3 operator +( const Matrix3 & mat ) const;
+
+    // Subtract a 3x3 matrix from another 3x3 matrix
+    // 
+    __forceinline const Matrix3 operator -( const Matrix3 & mat ) const;
+
+    // Negate all elements of a 3x3 matrix
+    // 
+    __forceinline const Matrix3 operator -( ) const;
+
+    // Multiply a 3x3 matrix by a scalar
+    // 
+    __forceinline const Matrix3 operator *( float scalar ) const;
+
+    // Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Matrix3 operator *( const floatInVec &scalar ) const;
+
+    // Multiply a 3x3 matrix by a 3-D vector
+    // 
+    __forceinline const Vector3 operator *( const Vector3 &vec ) const;
+
+    // Multiply two 3x3 matrices
+    // 
+    __forceinline const Matrix3 operator *( const Matrix3 & mat ) const;
+
+    // Perform compound assignment and addition with a 3x3 matrix
+    // 
+    __forceinline Matrix3 & operator +=( const Matrix3 & mat );
+
+    // Perform compound assignment and subtraction by a 3x3 matrix
+    // 
+    __forceinline Matrix3 & operator -=( const Matrix3 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    __forceinline Matrix3 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Matrix3 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and multiplication by a 3x3 matrix
+    // 
+    __forceinline Matrix3 & operator *=( const Matrix3 & mat );
+
+    // Construct an identity 3x3 matrix
+    // 
+    static __forceinline const Matrix3 identity( );
+
+    // Construct a 3x3 matrix to rotate around the x axis
+    // 
+    static __forceinline const Matrix3 rotationX( float radians );
+
+    // Construct a 3x3 matrix to rotate around the y axis
+    // 
+    static __forceinline const Matrix3 rotationY( float radians );
+
+    // Construct a 3x3 matrix to rotate around the z axis
+    // 
+    static __forceinline const Matrix3 rotationZ( float radians );
+
+    // Construct a 3x3 matrix to rotate around the x axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Matrix3 rotationX( const floatInVec &radians );
+
+    // Construct a 3x3 matrix to rotate around the y axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Matrix3 rotationY( const floatInVec &radians );
+
+    // Construct a 3x3 matrix to rotate around the z axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Matrix3 rotationZ( const floatInVec &radians );
+
+    // Construct a 3x3 matrix to rotate around the x, y, and z axes
+    // 
+    static __forceinline const Matrix3 rotationZYX( const Vector3 &radiansXYZ );
+
+    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector
+    // 
+    static __forceinline const Matrix3 rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a 3x3 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static __forceinline const Matrix3 rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static __forceinline const Matrix3 rotation( const Quat &unitQuat );
+
+    // Construct a 3x3 matrix to perform scaling
+    // 
+    static __forceinline const Matrix3 scale( const Vector3 &scaleVec );
+
+};
+// Multiply a 3x3 matrix by a scalar
+// 
+__forceinline const Matrix3 operator *( float scalar, const Matrix3 & mat );
+
+// Multiply a 3x3 matrix by a scalar (scalar data contained in vector data type)
+// 
+__forceinline const Matrix3 operator *( const floatInVec &scalar, const Matrix3 & mat );
+
+// Append (post-multiply) a scale transformation to a 3x3 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+__forceinline const Matrix3 appendScale( const Matrix3 & mat, const Vector3 &scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x3 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+__forceinline const Matrix3 prependScale( const Vector3 &scaleVec, const Matrix3 & mat );
+
+// Multiply two 3x3 matrices per element
+// 
+__forceinline const Matrix3 mulPerElem( const Matrix3 & mat0, const Matrix3 & mat1 );
+
+// Compute the absolute value of a 3x3 matrix per element
+// 
+__forceinline const Matrix3 absPerElem( const Matrix3 & mat );
+
+// Transpose of a 3x3 matrix
+// 
+__forceinline const Matrix3 transpose( const Matrix3 & mat );
+
+// Compute the inverse of a 3x3 matrix
+// NOTE: 
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+__forceinline const Matrix3 inverse( const Matrix3 & mat );
+
+// Determinant of a 3x3 matrix
+// 
+__forceinline const floatInVec determinant( const Matrix3 & mat );
+
+// Conditionally select between two 3x3 matrices
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+__forceinline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, bool select1 );
+
+// Conditionally select between two 3x3 matrices (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+__forceinline const Matrix3 select( const Matrix3 & mat0, const Matrix3 & mat1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x3 matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Matrix3 & mat );
+
+// Print a 3x3 matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Matrix3 & mat, const char * name );
+
+#endif
+
+// A 4x4 matrix in array-of-structures format
+//
+class Matrix4
+{
+    Vector4 mCol0;
+    Vector4 mCol1;
+    Vector4 mCol2;
+    Vector4 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    __forceinline Matrix4( ) { };
+
+    // Copy a 4x4 matrix
+    // 
+    __forceinline Matrix4( const Matrix4 & mat );
+
+    // Construct a 4x4 matrix containing the specified columns
+    // 
+    __forceinline Matrix4( const Vector4 &col0, const Vector4 &col1, const Vector4 &col2, const Vector4 &col3 );
+
+    // Construct a 4x4 matrix from a 3x4 transformation matrix
+    // 
+    explicit __forceinline Matrix4( const Transform3 & mat );
+
+    // Construct a 4x4 matrix from a 3x3 matrix and a 3-D vector
+    // 
+    __forceinline Matrix4( const Matrix3 & mat, const Vector3 &translateVec );
+
+    // Construct a 4x4 matrix from a unit-length quaternion and a 3-D vector
+    // 
+    __forceinline Matrix4( const Quat &unitQuat, const Vector3 &translateVec );
+
+    // Set all elements of a 4x4 matrix to the same scalar value
+    // 
+    explicit __forceinline Matrix4( float scalar );
+
+    // Set all elements of a 4x4 matrix to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit __forceinline Matrix4( const floatInVec &scalar );
+
+    // Assign one 4x4 matrix to another
+    // 
+    __forceinline Matrix4 & operator =( const Matrix4 & mat );
+
+    // Set the upper-left 3x3 submatrix
+    // NOTE: 
+    // This function does not change the bottom row elements.
+    // 
+    __forceinline Matrix4 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 4x4 matrix
+    // 
+    __forceinline const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // NOTE: 
+    // This function does not change the bottom row elements.
+    // 
+    __forceinline Matrix4 & setTranslation( const Vector3 &translateVec );
+
+    // Get the translation component of a 4x4 matrix
+    // 
+    __forceinline const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 4x4 matrix
+    // 
+    __forceinline Matrix4 & setCol0( const Vector4 &col0 );
+
+    // Set column 1 of a 4x4 matrix
+    // 
+    __forceinline Matrix4 & setCol1( const Vector4 &col1 );
+
+    // Set column 2 of a 4x4 matrix
+    // 
+    __forceinline Matrix4 & setCol2( const Vector4 &col2 );
+
+    // Set column 3 of a 4x4 matrix
+    // 
+    __forceinline Matrix4 & setCol3( const Vector4 &col3 );
+
+    // Get column 0 of a 4x4 matrix
+    // 
+    __forceinline const Vector4 getCol0( ) const;
+
+    // Get column 1 of a 4x4 matrix
+    // 
+    __forceinline const Vector4 getCol1( ) const;
+
+    // Get column 2 of a 4x4 matrix
+    // 
+    __forceinline const Vector4 getCol2( ) const;
+
+    // Get column 3 of a 4x4 matrix
+    // 
+    __forceinline const Vector4 getCol3( ) const;
+
+    // Set the column of a 4x4 matrix referred to by the specified index
+    // 
+    __forceinline Matrix4 & setCol( int col, const Vector4 &vec );
+
+    // Set the row of a 4x4 matrix referred to by the specified index
+    // 
+    __forceinline Matrix4 & setRow( int row, const Vector4 &vec );
+
+    // Get the column of a 4x4 matrix referred to by the specified index
+    // 
+    __forceinline const Vector4 getCol( int col ) const;
+
+    // Get the row of a 4x4 matrix referred to by the specified index
+    // 
+    __forceinline const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    __forceinline Vector4 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    __forceinline const Vector4 operator []( int col ) const;
+
+    // Set the element of a 4x4 matrix referred to by column and row indices
+    // 
+    __forceinline Matrix4 & setElem( int col, int row, float val );
+
+    // Set the element of a 4x4 matrix referred to by column and row indices (scalar data contained in vector data type)
+    // 
+    __forceinline Matrix4 & setElem( int col, int row, const floatInVec &val );
+
+    // Get the element of a 4x4 matrix referred to by column and row indices
+    // 
+    __forceinline const floatInVec getElem( int col, int row ) const;
+
+    // Add two 4x4 matrices
+    // 
+    __forceinline const Matrix4 operator +( const Matrix4 & mat ) const;
+
+    // Subtract a 4x4 matrix from another 4x4 matrix
+    // 
+    __forceinline const Matrix4 operator -( const Matrix4 & mat ) const;
+
+    // Negate all elements of a 4x4 matrix
+    // 
+    __forceinline const Matrix4 operator -( ) const;
+
+    // Multiply a 4x4 matrix by a scalar
+    // 
+    __forceinline const Matrix4 operator *( float scalar ) const;
+
+    // Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline const Matrix4 operator *( const floatInVec &scalar ) const;
+
+    // Multiply a 4x4 matrix by a 4-D vector
+    // 
+    __forceinline const Vector4 operator *( const Vector4 &vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D vector
+    // 
+    __forceinline const Vector4 operator *( const Vector3 &vec ) const;
+
+    // Multiply a 4x4 matrix by a 3-D point
+    // 
+    __forceinline const Vector4 operator *( const Point3 &pnt ) const;
+
+    // Multiply two 4x4 matrices
+    // 
+    __forceinline const Matrix4 operator *( const Matrix4 & mat ) const;
+
+    // Multiply a 4x4 matrix by a 3x4 transformation matrix
+    // 
+    __forceinline const Matrix4 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and addition with a 4x4 matrix
+    // 
+    __forceinline Matrix4 & operator +=( const Matrix4 & mat );
+
+    // Perform compound assignment and subtraction by a 4x4 matrix
+    // 
+    __forceinline Matrix4 & operator -=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a scalar
+    // 
+    __forceinline Matrix4 & operator *=( float scalar );
+
+    // Perform compound assignment and multiplication by a scalar (scalar data contained in vector data type)
+    // 
+    __forceinline Matrix4 & operator *=( const floatInVec &scalar );
+
+    // Perform compound assignment and multiplication by a 4x4 matrix
+    // 
+    __forceinline Matrix4 & operator *=( const Matrix4 & mat );
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // 
+    __forceinline Matrix4 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 4x4 matrix
+    // 
+    static __forceinline const Matrix4 identity( );
+
+    // Construct a 4x4 matrix to rotate around the x axis
+    // 
+    static __forceinline const Matrix4 rotationX( float radians );
+
+    // Construct a 4x4 matrix to rotate around the y axis
+    // 
+    static __forceinline const Matrix4 rotationY( float radians );
+
+    // Construct a 4x4 matrix to rotate around the z axis
+    // 
+    static __forceinline const Matrix4 rotationZ( float radians );
+
+    // Construct a 4x4 matrix to rotate around the x axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Matrix4 rotationX( const floatInVec &radians );
+
+    // Construct a 4x4 matrix to rotate around the y axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Matrix4 rotationY( const floatInVec &radians );
+
+    // Construct a 4x4 matrix to rotate around the z axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Matrix4 rotationZ( const floatInVec &radians );
+
+    // Construct a 4x4 matrix to rotate around the x, y, and z axes
+    // 
+    static __forceinline const Matrix4 rotationZYX( const Vector3 &radiansXYZ );
+
+    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector
+    // 
+    static __forceinline const Matrix4 rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a 4x4 matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static __forceinline const Matrix4 rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static __forceinline const Matrix4 rotation( const Quat &unitQuat );
+
+    // Construct a 4x4 matrix to perform scaling
+    // 
+    static __forceinline const Matrix4 scale( const Vector3 &scaleVec );
+
+    // Construct a 4x4 matrix to perform translation
+    // 
+    static __forceinline const Matrix4 translation( const Vector3 &translateVec );
+
+    // Construct viewing matrix based on eye, position looked at, and up direction
+    // 
+    static __forceinline const Matrix4 lookAt( const Point3 &eyePos, const Point3 &lookAtPos, const Vector3 &upVec );
+
+    // Construct a perspective projection matrix
+    // 
+    static __forceinline const Matrix4 perspective( float fovyRadians, float aspect, float zNear, float zFar );
+
+    // Construct a perspective projection matrix based on frustum
+    // 
+    static __forceinline const Matrix4 frustum( float left, float right, float bottom, float top, float zNear, float zFar );
+
+    // Construct an orthographic projection matrix
+    // 
+    static __forceinline const Matrix4 orthographic( float left, float right, float bottom, float top, float zNear, float zFar );
+
+};
+// Multiply a 4x4 matrix by a scalar
+// 
+__forceinline const Matrix4 operator *( float scalar, const Matrix4 & mat );
+
+// Multiply a 4x4 matrix by a scalar (scalar data contained in vector data type)
+// 
+__forceinline const Matrix4 operator *( const floatInVec &scalar, const Matrix4 & mat );
+
+// Append (post-multiply) a scale transformation to a 4x4 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+__forceinline const Matrix4 appendScale( const Matrix4 & mat, const Vector3 &scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 4x4 matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+__forceinline const Matrix4 prependScale( const Vector3 &scaleVec, const Matrix4 & mat );
+
+// Multiply two 4x4 matrices per element
+// 
+__forceinline const Matrix4 mulPerElem( const Matrix4 & mat0, const Matrix4 & mat1 );
+
+// Compute the absolute value of a 4x4 matrix per element
+// 
+__forceinline const Matrix4 absPerElem( const Matrix4 & mat );
+
+// Transpose of a 4x4 matrix
+// 
+__forceinline const Matrix4 transpose( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix
+// NOTE: 
+// Result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+__forceinline const Matrix4 inverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.  The result is unpredictable when the determinant of mat is equal to or near 0.
+// 
+__forceinline const Matrix4 affineInverse( const Matrix4 & mat );
+
+// Compute the inverse of a 4x4 matrix, which is expected to be an affine matrix with an orthogonal upper-left 3x3 submatrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 4x4 matrix meets the given restrictions.
+// 
+__forceinline const Matrix4 orthoInverse( const Matrix4 & mat );
+
+// Determinant of a 4x4 matrix
+// 
+__forceinline const floatInVec determinant( const Matrix4 & mat );
+
+// Conditionally select between two 4x4 matrices
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+__forceinline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, bool select1 );
+
+// Conditionally select between two 4x4 matrices (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+__forceinline const Matrix4 select( const Matrix4 & mat0, const Matrix4 & mat1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 4x4 matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Matrix4 & mat );
+
+// Print a 4x4 matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Matrix4 & mat, const char * name );
+
+#endif
+
+// A 3x4 transformation matrix in array-of-structures format
+//
+class Transform3
+{
+    Vector3 mCol0;
+    Vector3 mCol1;
+    Vector3 mCol2;
+    Vector3 mCol3;
+
+public:
+    // Default constructor; does no initialization
+    // 
+    __forceinline Transform3( ) { };
+
+    // Copy a 3x4 transformation matrix
+    // 
+    __forceinline Transform3( const Transform3 & tfrm );
+
+    // Construct a 3x4 transformation matrix containing the specified columns
+    // 
+    __forceinline Transform3( const Vector3 &col0, const Vector3 &col1, const Vector3 &col2, const Vector3 &col3 );
+
+    // Construct a 3x4 transformation matrix from a 3x3 matrix and a 3-D vector
+    // 
+    __forceinline Transform3( const Matrix3 & tfrm, const Vector3 &translateVec );
+
+    // Construct a 3x4 transformation matrix from a unit-length quaternion and a 3-D vector
+    // 
+    __forceinline Transform3( const Quat &unitQuat, const Vector3 &translateVec );
+
+    // Set all elements of a 3x4 transformation matrix to the same scalar value
+    // 
+    explicit __forceinline Transform3( float scalar );
+
+    // Set all elements of a 3x4 transformation matrix to the same scalar value (scalar data contained in vector data type)
+    // 
+    explicit __forceinline Transform3( const floatInVec &scalar );
+
+    // Assign one 3x4 transformation matrix to another
+    // 
+    __forceinline Transform3 & operator =( const Transform3 & tfrm );
+
+    // Set the upper-left 3x3 submatrix
+    // 
+    __forceinline Transform3 & setUpper3x3( const Matrix3 & mat3 );
+
+    // Get the upper-left 3x3 submatrix of a 3x4 transformation matrix
+    // 
+    __forceinline const Matrix3 getUpper3x3( ) const;
+
+    // Set translation component
+    // 
+    __forceinline Transform3 & setTranslation( const Vector3 &translateVec );
+
+    // Get the translation component of a 3x4 transformation matrix
+    // 
+    __forceinline const Vector3 getTranslation( ) const;
+
+    // Set column 0 of a 3x4 transformation matrix
+    // 
+    __forceinline Transform3 & setCol0( const Vector3 &col0 );
+
+    // Set column 1 of a 3x4 transformation matrix
+    // 
+    __forceinline Transform3 & setCol1( const Vector3 &col1 );
+
+    // Set column 2 of a 3x4 transformation matrix
+    // 
+    __forceinline Transform3 & setCol2( const Vector3 &col2 );
+
+    // Set column 3 of a 3x4 transformation matrix
+    // 
+    __forceinline Transform3 & setCol3( const Vector3 &col3 );
+
+    // Get column 0 of a 3x4 transformation matrix
+    // 
+    __forceinline const Vector3 getCol0( ) const;
+
+    // Get column 1 of a 3x4 transformation matrix
+    // 
+    __forceinline const Vector3 getCol1( ) const;
+
+    // Get column 2 of a 3x4 transformation matrix
+    // 
+    __forceinline const Vector3 getCol2( ) const;
+
+    // Get column 3 of a 3x4 transformation matrix
+    // 
+    __forceinline const Vector3 getCol3( ) const;
+
+    // Set the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    __forceinline Transform3 & setCol( int col, const Vector3 &vec );
+
+    // Set the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    __forceinline Transform3 & setRow( int row, const Vector4 &vec );
+
+    // Get the column of a 3x4 transformation matrix referred to by the specified index
+    // 
+    __forceinline const Vector3 getCol( int col ) const;
+
+    // Get the row of a 3x4 transformation matrix referred to by the specified index
+    // 
+    __forceinline const Vector4 getRow( int row ) const;
+
+    // Subscripting operator to set or get a column
+    // 
+    __forceinline Vector3 & operator []( int col );
+
+    // Subscripting operator to get a column
+    // 
+    __forceinline const Vector3 operator []( int col ) const;
+
+    // Set the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    __forceinline Transform3 & setElem( int col, int row, float val );
+
+    // Set the element of a 3x4 transformation matrix referred to by column and row indices (scalar data contained in vector data type)
+    // 
+    __forceinline Transform3 & setElem( int col, int row, const floatInVec &val );
+
+    // Get the element of a 3x4 transformation matrix referred to by column and row indices
+    // 
+    __forceinline const floatInVec getElem( int col, int row ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D vector
+    // 
+    __forceinline const Vector3 operator *( const Vector3 &vec ) const;
+
+    // Multiply a 3x4 transformation matrix by a 3-D point
+    // 
+    __forceinline const Point3 operator *( const Point3 &pnt ) const;
+
+    // Multiply two 3x4 transformation matrices
+    // 
+    __forceinline const Transform3 operator *( const Transform3 & tfrm ) const;
+
+    // Perform compound assignment and multiplication by a 3x4 transformation matrix
+    // 
+    __forceinline Transform3 & operator *=( const Transform3 & tfrm );
+
+    // Construct an identity 3x4 transformation matrix
+    // 
+    static __forceinline const Transform3 identity( );
+
+    // Construct a 3x4 transformation matrix to rotate around the x axis
+    // 
+    static __forceinline const Transform3 rotationX( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the y axis
+    // 
+    static __forceinline const Transform3 rotationY( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the z axis
+    // 
+    static __forceinline const Transform3 rotationZ( float radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the x axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Transform3 rotationX( const floatInVec &radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the y axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Transform3 rotationY( const floatInVec &radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the z axis (scalar data contained in vector data type)
+    // 
+    static __forceinline const Transform3 rotationZ( const floatInVec &radians );
+
+    // Construct a 3x4 transformation matrix to rotate around the x, y, and z axes
+    // 
+    static __forceinline const Transform3 rotationZYX( const Vector3 &radiansXYZ );
+
+    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector
+    // 
+    static __forceinline const Transform3 rotation( float radians, const Vector3 &unitVec );
+
+    // Construct a 3x4 transformation matrix to rotate around a unit-length 3-D vector (scalar data contained in vector data type)
+    // 
+    static __forceinline const Transform3 rotation( const floatInVec &radians, const Vector3 &unitVec );
+
+    // Construct a rotation matrix from a unit-length quaternion
+    // 
+    static __forceinline const Transform3 rotation( const Quat &unitQuat );
+
+    // Construct a 3x4 transformation matrix to perform scaling
+    // 
+    static __forceinline const Transform3 scale( const Vector3 &scaleVec );
+
+    // Construct a 3x4 transformation matrix to perform translation
+    // 
+    static __forceinline const Transform3 translation( const Vector3 &translateVec );
+
+};
+// Append (post-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+__forceinline const Transform3 appendScale( const Transform3 & tfrm, const Vector3 &scaleVec );
+
+// Prepend (pre-multiply) a scale transformation to a 3x4 transformation matrix
+// NOTE: 
+// Faster than creating and multiplying a scale transformation matrix.
+// 
+__forceinline const Transform3 prependScale( const Vector3 &scaleVec, const Transform3 & tfrm );
+
+// Multiply two 3x4 transformation matrices per element
+// 
+__forceinline const Transform3 mulPerElem( const Transform3 & tfrm0, const Transform3 & tfrm1 );
+
+// Compute the absolute value of a 3x4 transformation matrix per element
+// 
+__forceinline const Transform3 absPerElem( const Transform3 & tfrm );
+
+// Inverse of a 3x4 transformation matrix
+// NOTE: 
+// Result is unpredictable when the determinant of the left 3x3 submatrix is equal to or near 0.
+// 
+__forceinline const Transform3 inverse( const Transform3 & tfrm );
+
+// Compute the inverse of a 3x4 transformation matrix, expected to have an orthogonal upper-left 3x3 submatrix
+// NOTE: 
+// This can be used to achieve better performance than a general inverse when the specified 3x4 transformation matrix meets the given restrictions.
+// 
+__forceinline const Transform3 orthoInverse( const Transform3 & tfrm );
+
+// Conditionally select between two 3x4 transformation matrices
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// However, the transfer of select1 to a VMX register may use more processing time than a branch.
+// Use the boolInVec version for better performance.
+// 
+__forceinline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, bool select1 );
+
+// Conditionally select between two 3x4 transformation matrices (scalar data contained in vector data type)
+// NOTE: 
+// This function uses a conditional select instruction to avoid a branch.
+// 
+__forceinline const Transform3 select( const Transform3 & tfrm0, const Transform3 & tfrm1, const boolInVec &select1 );
+
+#ifdef _VECTORMATH_DEBUG
+
+// Print a 3x4 transformation matrix
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Transform3 & tfrm );
+
+// Print a 3x4 transformation matrix and an associated string identifier
+// NOTE: 
+// Function is only defined when _VECTORMATH_DEBUG is defined.
+// 
+__forceinline void print( const Transform3 & tfrm, const char * name );
+
+#endif
+
+} // namespace Aos
+} // namespace Vectormath
+
+#include "vec_aos.h"
+#include "quat_aos.h"
+#include "mat_aos.h"
+
+#endif
diff --git a/src/BulletMultiThreaded/vectormath2bullet.h b/src/BulletMultiThreaded/vectormath2bullet.h
index 5a4944a55..df0f688d4 100644
--- a/src/BulletMultiThreaded/vectormath2bullet.h
+++ b/src/BulletMultiThreaded/vectormath2bullet.h
@@ -31,12 +31,7 @@
 #define AOS_VECTORMATH_BULLET_CONVERT_H
 
 
-///only use a system-wide vectormath_aos.h on CELLOS_LV2 or if USE_SYSTEM_VECTORMATH
-#if defined(__CELLOS_LV2__) || defined (USE_SYSTEM_VECTORMATH)
-#include <vectormath_aos.h>
-#else
-#include "BulletMultiThreaded/vectormath/scalar/cpp/vectormath_aos.h"
-#endif
+#include "vectormath_aos.h"
 
 #include "LinearMath/btVector3.h"
 #include "LinearMath/btQuaternion.h"
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6c26661a8..d70e6004c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,8 +1,4 @@
-if (CMAKE_SIZEOF_VOID_P MATCHES "8")
-SUBDIRS( BulletSoftBody BulletCollision BulletDynamics LinearMath )
-else (CMAKE_SIZEOF_VOID_P MATCHES "8")
 SUBDIRS( BulletMultiThreaded BulletSoftBody BulletCollision BulletDynamics LinearMath )
-endif (CMAKE_SIZEOF_VOID_P MATCHES "8")
 
 IF(INSTALL_LIBS)
 	#INSTALL of other files requires CMake 2.6
diff --git a/src/LinearMath/btMinMax.h b/src/LinearMath/btMinMax.h
index 33ea10ce3..80601c1e2 100644
--- a/src/LinearMath/btMinMax.h
+++ b/src/LinearMath/btMinMax.h
@@ -17,7 +17,7 @@ subject to the following restrictions:
 #ifndef GEN_MINMAX_H
 #define GEN_MINMAX_H
 
-#include "btScalar.h"
+#include "LinearMath/btScalar.h"
 
 template <class T>
 SIMD_FORCE_INLINE const T& btMin(const T& a, const T& b) 
@@ -32,7 +32,7 @@ SIMD_FORCE_INLINE const T& btMax(const T& a, const T& b)
 }
 
 template <class T>
-SIMD_FORCE_INLINE const T& GEN_clamped(const T& a, const T& lb, const T& ub) 
+SIMD_FORCE_INLINE const T& btClamped(const T& a, const T& lb, const T& ub) 
 {
 	return a < lb ? lb : (ub < a ? ub : a); 
 }
@@ -56,7 +56,7 @@ SIMD_FORCE_INLINE void btSetMax(T& a, const T& b)
 }
 
 template <class T>
-SIMD_FORCE_INLINE void GEN_clamp(T& a, const T& lb, const T& ub) 
+SIMD_FORCE_INLINE void btClamp(T& a, const T& lb, const T& ub) 
 {
 	if (a < lb) 
 	{
diff --git a/src/LinearMath/btPoolAllocator.h b/src/LinearMath/btPoolAllocator.h
index 39d2559c7..e3b4be9b1 100755
--- a/src/LinearMath/btPoolAllocator.h
+++ b/src/LinearMath/btPoolAllocator.h
@@ -57,6 +57,11 @@ public:
 		return m_freeCount;
 	}
 
+	int getUsedCount() const
+	{
+		return m_maxElements - m_freeCount;
+	}
+
 	void*	allocate(int size)
 	{
 		// release mode fix
@@ -96,6 +101,15 @@ public:
 		return m_elemSize;
 	}
 
+	unsigned char*	getPoolAddress()
+	{
+		return m_pool;
+	}
+
+	const unsigned char*	getPoolAddress() const
+	{
+		return m_pool;
+	}
 
 };
 
diff --git a/src/LinearMath/btScalar.h b/src/LinearMath/btScalar.h
index 9f6ceb3e0..f5694e0af 100644
--- a/src/LinearMath/btScalar.h
+++ b/src/LinearMath/btScalar.h
@@ -93,7 +93,7 @@ inline int	btGetVersion()
 #else
 	
 #if defined	(__CELLOS_LV2__)
-		#define SIMD_FORCE_INLINE inline
+		#define SIMD_FORCE_INLINE inline __attribute__((always_inline))
 		#define ATTRIBUTE_ALIGNED16(a) a __attribute__ ((aligned (16)))
 		#define ATTRIBUTE_ALIGNED64(a) a __attribute__ ((aligned (64)))
 		#define ATTRIBUTE_ALIGNED128(a) a __attribute__ ((aligned (128)))
@@ -101,7 +101,14 @@ inline int	btGetVersion()
 		#include <assert.h>
 		#endif
 #ifdef BT_DEBUG
-		#define btAssert assert
+#ifdef __SPU__
+#include <spu_printf.h>
+#define printf spu_printf
+	#define btAssert(x) {if(!(x)){printf("Assert "__FILE__ ":%u ("#x")\n", __LINE__);spu_hcmpeq(0,0);}}
+#else
+	#define btAssert assert
+#endif
+	
 #else
 		#define btAssert(x)
 #endif
diff --git a/src/LinearMath/btVector3.h b/src/LinearMath/btVector3.h
index 84446f229..f1f0be45c 100644
--- a/src/LinearMath/btVector3.h
+++ b/src/LinearMath/btVector3.h
@@ -667,23 +667,32 @@ SIMD_FORCE_INLINE void	btUnSwapVector3Endian(btVector3& vector)
 	vector = swappedVec;
 }
 
-SIMD_FORCE_INLINE void btPlaneSpace1 (const btVector3& n, btVector3& p, btVector3& q)
+template <class T>
+SIMD_FORCE_INLINE void btPlaneSpace1 (const T& n, T& p, T& q)
 {
-  if (btFabs(n.z()) > SIMDSQRT12) {
+  if (btFabs(n[2]) > SIMDSQRT12) {
     // choose p in y-z plane
     btScalar a = n[1]*n[1] + n[2]*n[2];
     btScalar k = btRecipSqrt (a);
-    p.setValue(0,-n[2]*k,n[1]*k);
+    p[0] = 0;
+	p[1] = -n[2]*k;
+	p[2] = n[1]*k;
     // set q = n x p
-    q.setValue(a*k,-n[0]*p[2],n[0]*p[1]);
+    q[0] = a*k;
+	q[1] = -n[0]*p[2];
+	q[2] = n[0]*p[1];
   }
   else {
     // choose p in x-y plane
-    btScalar a = n.x()*n.x() + n.y()*n.y();
+    btScalar a = n[0]*n[0] + n[1]*n[1];
     btScalar k = btRecipSqrt (a);
-    p.setValue(-n.y()*k,n.x()*k,0);
+    p[0] = -n[1]*k;
+	p[1] = n[0]*k;
+	p[2] = 0;
     // set q = n x p
-    q.setValue(-n.z()*p.y(),n.z()*p.x(),a*k);
+    q[0] = -n[2]*p[1];
+	q[1] = n[2]*p[0];
+	q[2] = a*k;
   }
 }
 
diff --git a/src/MiniCL/cl_MiniCL_Defs.h b/src/MiniCL/cl_MiniCL_Defs.h
index ffdac1026..285f48b64 100644
--- a/src/MiniCL/cl_MiniCL_Defs.h
+++ b/src/MiniCL/cl_MiniCL_Defs.h
@@ -22,7 +22,7 @@ subject to the following restrictions:
 
 #define __kernel
 #define __global
-#define __local
+#define __local static
 #define get_global_id(a)	__guid_arg
 #define get_local_id(a)		((__guid_arg) % gMiniCLNumOutstandingTasks)
 #define get_local_size(a)	(gMiniCLNumOutstandingTasks)